From 74921eee924625426429044decefe3673561b174 Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Wed, 12 Apr 2023 17:43:17 +0100 Subject: Update CPU kernel implementations and guard directives Resolves COMPMID-6023 Change-Id: I868975d14c4f98af6716726feda22405a6a4c891 Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9686 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 28 +- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 4 + .../NEON/kernels/arm_gemm/gemm_implementation.hpp | 10 +- src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 3 +- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 1 + .../arm_gemm/gemm_interleaved_pretransposed_2d.hpp | 566 --- src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp | 7 +- src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp | 7 +- src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 3 +- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 1 + .../a64_interleave4_block16_s8_s8.hpp | 1 - .../a64_interleave4_block16_s8_s8_summing.hpp | 4 +- .../a64_interleave4_block16_u8_u8_summing.hpp | 4 +- .../a64_interleave8_block1_bf16_fp32.hpp | 107 +- .../a64_interleave8_block1_fp16_fp16.hpp | 195 +- .../a64_interleave8_block1_fp16_fp32.hpp | 107 +- .../a64_interleave8_block1_fp32_fp32.hpp | 87 +- .../a64_interleave8_block1_s16_s16.hpp | 195 +- .../a64_interleave8_block1_s16_s16_summing.hpp | 126 +- .../a64_interleave8_block1_s8_s16.hpp | 209 +- .../a64_interleave8_block1_s8_s16_summing.hpp | 152 +- .../a64_interleave8_block1_u16_u16_summing.hpp | 126 +- .../a64_interleave8_block1_u8_u16.hpp | 209 +- .../a64_interleave8_block1_u8_u16_summing.hpp | 152 +- .../a64_interleave8_block2_bf16_bf16.hpp | 125 +- .../a64_interleave8_block2_fp32_fp32.hpp | 81 +- .../a64_interleave8_block4_bf16_bf16.hpp | 145 +- .../a64_interleave8_block4_fp32_bf16.hpp | 89 +- .../a64_interleave8_block4_s8_s8.hpp | 189 +- .../a64_interleave8_block4_s8_s8_summing.hpp | 294 +- .../a64_interleave8_block4_u8_u8_summing.hpp | 294 +- .../a64_interleave8_block8_s8_s8.hpp | 273 +- .../a64_interleave8_block8_s8_s8_summing.hpp | 158 +- .../a64_interleave8_block8_u8_u8_summing.hpp | 158 +- .../sme2_interleave1VL_block2_fp32_bf16.hpp | 8 +- .../sme2_interleave2VL_block2_fp32_bf16.hpp | 8 +- .../sme2_interleave4VL_block2_fp32_bf16.hpp | 8 +- .../sme_interleave1VL_bf16_bf16.hpp | 14 +- .../sme_interleave1VL_block2_bf16_bf16.hpp | 14 +- .../sme_interleave1VL_block4_s8_s8.hpp | 14 +- .../sme_interleave1VL_block4_s8_s8_summing.hpp | 10 +- .../sme_interleave1VL_block4_u8_u8.hpp | 14 +- .../sme_interleave1VL_block4_u8_u8_summing.hpp | 10 +- .../sme_interleave1VL_fp16_fp16.hpp | 14 +- .../sme_interleave1VL_fp32_fp32.hpp | 14 +- .../sme_interleave2VL_bf16_bf16.hpp | 8 +- .../sme_interleave2VL_block2_bf16_bf16.hpp | 176 +- .../sme_interleave2VL_block2_fp16_fp16.hpp | 176 +- .../sme_interleave2VL_block4_s8_s8.hpp | 18 +- .../sme_interleave2VL_block4_s8_s8_summing.hpp | 102 +- .../sme_interleave2VL_block4_u8_u8.hpp | 18 +- .../sme_interleave2VL_block4_u8_u8_summing.hpp | 102 +- .../sme_interleave2VL_fp16_fp16.hpp | 8 +- .../sme_interleave2VL_fp32_fp32.hpp | 152 +- .../sme_interleave4VL_block2_bf16_bf16.hpp | 8 +- .../sme_interleave4VL_block4_s8_s8.hpp | 8 +- .../sme_interleave4VL_block4_s8_s8_summing.hpp | 16 +- .../sme_interleave4VL_block4_u8_u8.hpp | 8 +- .../sme_interleave4VL_block4_u8_u8_summing.hpp | 16 +- .../sme_interleave4VL_fp32_fp32.hpp | 8 +- 
.../kernels/arm_gemm/interleave_indirect_impl.hpp | 22 +- .../kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp | 12 +- .../a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp | 1876 ++++---- .../kernels/a64_ffhybrid_fp16_mla_6x32.hpp | 12 +- .../kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp | 3274 +++++++------- .../kernels/a64_ffhybrid_fp32_mla_6x16.hpp | 12 +- .../kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp | 1832 ++++---- .../a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp | 12 +- .../generic.cpp | 600 +-- .../a64_ffinterleaved_bf16fp32_dot_8x12.hpp | 12 +- .../generic.cpp | 146 +- .../a64_ffinterleaved_bf16fp32_mmla_8x12.hpp | 12 +- .../generic.cpp | 228 +- .../kernels/a64_ffinterleaved_fp16_mla_8x24.hpp | 12 +- .../a64_ffinterleaved_fp16_mla_8x24/generic.cpp | 144 +- .../kernels/a64_ffinterleaved_fp32_mla_8x12.hpp | 12 +- .../a64_ffinterleaved_fp32_mla_8x12/generic.cpp | 222 +- .../kernels/a64_hybrid_bf16fp32_dot_6x16.hpp | 15 +- .../a64_hybrid_bf16fp32_dot_6x16/generic.cpp | 1988 +++++---- .../kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp | 15 +- .../a64_hybrid_bf16fp32_mmla_6x16/generic.cpp | 1892 +++++---- .../arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp | 19 +- .../kernels/a64_hybrid_fp16_mla_6x32/a55.cpp | 4391 ++++++++++--------- .../kernels/a64_hybrid_fp16_mla_6x32/generic.cpp | 3349 ++++++++------- .../arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp | 15 +- .../kernels/a64_hybrid_fp32_mla_4x24/a55.cpp | 1628 ++++--- .../kernels/a64_hybrid_fp32_mla_4x24/generic.cpp | 1120 +++-- .../arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp | 15 +- .../kernels/a64_hybrid_fp32_mla_6x16/a55.cpp | 2424 ++++++----- .../kernels/a64_hybrid_fp32_mla_6x16/generic.cpp | 1866 ++++---- .../arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp | 15 +- .../kernels/a64_hybrid_fp32_mla_8x4/a55.cpp | 2786 ++++++------ .../kernels/a64_hybrid_fp32_mla_8x4/generic.cpp | 338 +- .../kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp | 15 +- .../a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp | 602 ++- .../kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp | 15 +- .../a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp | 942 +++-- .../arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp | 15 +- .../kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp | 2632 ++++++------ .../kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp | 1612 ++++--- .../arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp | 15 +- .../kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp | 1616 ++++--- .../arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp | 15 +- .../kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp | 4462 ++++++++++---------- .../kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp | 3620 ++++++++-------- .../arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp | 15 +- .../kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp | 3562 ++++++++-------- .../arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp | 21 +- .../kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp | 2542 ++++++----- .../kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp | 1810 ++++---- .../kernels/a64_hybrid_s8s32_mmla_6x16.hpp | 17 +- .../kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp | 1704 ++++---- .../arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp | 17 +- .../kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp | 2632 ++++++------ .../kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp | 1612 ++++--- .../arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp | 17 +- .../kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp | 1616 ++++--- .../arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp | 15 +- .../kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp | 2542 ++++++----- .../kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp | 1810 ++++---- 
.../kernels/a64_hybrid_u8u32_mmla_6x16.hpp | 15 +- .../kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp | 1704 ++++---- .../kernels/a64_interleaved_bf16fp32_dot_8x12.hpp | 20 +- .../a64_interleaved_bf16fp32_dot_8x12/generic.cpp | 121 +- .../kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp | 20 +- .../a64_interleaved_bf16fp32_mmla_8x12/a510.cpp | 201 +- .../a64_interleaved_bf16fp32_mmla_8x12/generic.cpp | 201 +- .../kernels/a64_interleaved_s8s32_mmla_8x12.hpp | 21 +- .../a64_interleaved_s8s32_mmla_8x12/a510.cpp | 201 +- .../a64_interleaved_s8s32_mmla_8x12/generic.cpp | 201 +- .../kernels/a64_interleaved_u8u32_mmla_8x12.hpp | 20 +- .../a64_interleaved_u8u32_mmla_8x12/a510.cpp | 201 +- .../a64_interleaved_u8u32_mmla_8x12/generic.cpp | 201 +- .../kernels/sme2_gemv_bf16fp32_dot_16VL.hpp | 17 +- .../sme2_gemv_bf16fp32_dot_16VL/generic.cpp | 541 ++- .../arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp | 17 +- .../kernels/sme2_gemv_fp32_mla_16VL/generic.cpp | 541 ++- .../kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp | 17 +- .../sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp | 679 ++- .../arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp | 17 +- .../kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp | 751 ++-- .../arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp | 17 +- .../kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp | 751 ++-- ...2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp | 9 +- .../generic.cpp | 248 +- ...2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp | 9 +- .../generic.cpp | 252 +- ...2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp | 9 +- .../generic.cpp | 300 +- .../sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp | 9 +- .../generic.cpp | 248 +- .../sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp | 9 +- .../generic.cpp | 252 +- .../sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp | 9 +- .../generic.cpp | 300 +- .../sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp | 9 +- .../generic.cpp | 298 +- .../sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp | 9 +- .../generic.cpp | 296 +- .../sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp | 9 +- .../generic.cpp | 326 +- ...sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp | 13 +- .../generic.cpp | 202 +- ...sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp | 13 +- .../generic.cpp | 216 +- ...sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp | 13 +- .../generic.cpp | 264 +- .../sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp | 9 +- .../generic.cpp | 298 +- .../sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp | 9 +- .../generic.cpp | 296 +- .../sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp | 9 +- .../generic.cpp | 326 +- .../kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp | 12 +- .../sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp | 1934 ++++----- .../kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp | 12 +- .../kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp | 678 +-- .../sve_ffhybrid_fp16_mla_6x4VL/generic.cpp | 4010 +++++++++--------- .../kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp | 12 +- .../kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp | 678 +-- .../sve_ffhybrid_fp32_mla_6x4VL/generic.cpp | 2282 +++++----- .../sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp | 12 +- .../generic.cpp | 1032 ++--- .../sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp | 12 +- .../generic.cpp | 288 +- .../kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp | 12 +- .../sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp | 194 +- .../sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp | 146 +- .../kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp | 12 +- .../sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp | 194 +- .../sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp | 148 +- 
.../kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp | 16 +- .../sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp | 2283 +++++----- .../kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp | 16 +- .../sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp | 1936 ++++----- .../arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp | 18 +- .../kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp | 679 ++- .../kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp | 4011 +++++++++--------- .../arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp | 24 +- .../kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp | 679 ++- .../kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp | 2283 +++++----- .../arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp | 15 +- .../kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp | 475 ++- .../kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp | 1067 +++-- .../kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp | 16 +- .../sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp | 1033 +++-- .../kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp | 16 +- .../sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp | 1473 ++++--- .../arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp | 16 +- .../kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp | 1663 ++++---- .../kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp | 16 +- .../kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp | 1513 ++++--- .../arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp | 16 +- .../kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp | 3271 +++++++------- .../kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp | 16 +- .../kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp | 2829 +++++++------ .../kernels/sve_hybrid_s8s32_dot_6x4VL.hpp | 31 +- .../kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp | 467 +- .../kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp | 2071 +++++---- .../kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp | 17 +- .../sve_hybrid_s8s32_mmla_6x4VL/generic.cpp | 1809 ++++---- .../arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp | 16 +- .../kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp | 1663 ++++---- .../kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp | 16 +- .../kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp | 1513 ++++--- .../kernels/sve_hybrid_u8u32_dot_6x4VL.hpp | 31 +- .../kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp | 467 +- .../kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp | 2071 +++++---- .../kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp | 17 +- .../sve_hybrid_u8u32_mmla_6x4VL/generic.cpp | 1809 ++++---- .../kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp | 20 +- .../sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp | 122 +- .../sve_interleaved_bf16fp32_mmla_8x3VL.hpp | 20 +- .../generic.cpp | 258 +- .../kernels/sve_interleaved_fp16_mla_8x3VL.hpp | 22 +- .../sve_interleaved_fp16_mla_8x3VL/a64fx.cpp | 182 +- .../sve_interleaved_fp16_mla_8x3VL/generic.cpp | 120 +- .../kernels/sve_interleaved_fp32_mla_8x3VL.hpp | 28 +- .../sve_interleaved_fp32_mla_8x3VL/a64fx.cpp | 182 +- .../sve_interleaved_fp32_mla_8x3VL/generic.cpp | 120 +- .../kernels/sve_interleaved_s8s32_dot_8x3VL.hpp | 34 +- .../sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp | 182 +- .../sve_interleaved_s8s32_dot_8x3VL/generic.cpp | 120 +- .../kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp | 22 +- .../sve_interleaved_s8s32_mmla_8x3VL/generic.cpp | 258 +- .../kernels/sve_interleaved_u8u32_dot_8x3VL.hpp | 34 +- .../sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp | 182 +- .../sve_interleaved_u8u32_dot_8x3VL/generic.cpp | 120 +- .../kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp | 20 +- .../sve_interleaved_u8u32_mmla_8x3VL/generic.cpp | 258 +- src/core/NEON/kernels/arm_gemm/misc.cpp | 6 +- src/core/NEON/kernels/arm_gemm/quantized.hpp | 4 +- 
.../NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp | 2 +- .../NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp | 2 +- .../NEON/kernels/arm_gemm/std_transforms_sme.hpp | 5 +- src/core/NEON/kernels/arm_gemm/transform.cpp | 4 +- .../transforms/a64_transpose_interleave_128.hpp | 7 +- .../transforms/a64_transpose_interleave_12_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_12_1x8.hpp | 6 +- .../transforms/a64_transpose_interleave_12_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_12_2x4.hpp | 7 +- .../a64_transpose_interleave_12_2x4_fp32bf16.hpp | 6 +- .../a64_transpose_interleave_12_s8s16.hpp | 7 +- .../a64_transpose_interleave_12_u8u16.hpp | 7 +- .../transforms/a64_transpose_interleave_16.hpp | 5 +- .../transforms/a64_transpose_interleave_16_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_16_1x8.hpp | 6 +- .../transforms/a64_transpose_interleave_16_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_16_2x4.hpp | 7 +- .../a64_transpose_interleave_16_2x4_fp32bf16.hpp | 7 +- .../transforms/a64_transpose_interleave_24.hpp | 6 +- .../a64_transpose_interleave_24_2x4_fp32bf16.hpp | 6 +- .../a64_transpose_interleave_24_bf16fp32.hpp | 7 +- .../a64_transpose_interleave_24_fp16fp32.hpp | 6 +- .../transforms/a64_transpose_interleave_32_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_32_2x2.hpp | 7 +- .../transforms/a64_transpose_interleave_48.hpp | 6 +- .../transforms/a64_transpose_interleave_4_1x16.hpp | 6 +- .../transforms/a64_transpose_interleave_4_1x4.hpp | 5 +- .../transforms/a64_transpose_interleave_64.hpp | 6 +- .../transforms/a64_transpose_interleave_96.hpp | 6 +- .../transforms/sme_transpose_interleave_16VL.hpp | 6 +- .../sme_transpose_interleave_16VL_1x4.hpp | 6 +- .../sme_transpose_interleave_16VL_2x2.hpp | 6 +- .../sme_transpose_interleave_16VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_1VL.hpp | 6 +- .../sme_transpose_interleave_1VL_1x4.hpp | 6 +- .../sme_transpose_interleave_1VL_2x2.hpp | 5 +- .../sme_transpose_interleave_1VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_2VL.hpp | 6 +- .../sme_transpose_interleave_2VL_1x4.hpp | 6 +- .../sme_transpose_interleave_2VL_2x2.hpp | 6 +- .../sme_transpose_interleave_2VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_4VL.hpp | 6 +- .../sme_transpose_interleave_4VL_1x4.hpp | 6 +- .../sme_transpose_interleave_4VL_2x2.hpp | 6 +- .../sme_transpose_interleave_4VL_2x2_fp32bf16.hpp | 6 +- .../transforms/sme_transpose_interleave_8VL.hpp | 208 + .../sme_transpose_interleave_8VL_1x4.hpp | 143 + .../sme_transpose_interleave_8VL_2x2.hpp | 132 + .../sve_transpose_interleave_12VL_2x4_fp32bf16.hpp | 6 +- .../transforms/sve_transpose_interleave_1VL.hpp | 8 +- .../sve_transpose_interleave_1VL_1x4.hpp | 6 +- .../transforms/sve_transpose_interleave_3VL.hpp | 8 +- .../sve_transpose_interleave_3VL_1x4.hpp | 7 +- .../sve_transpose_interleave_3VL_2x2.hpp | 7 +- .../transforms/sve_transpose_interleave_4VL.hpp | 8 +- .../sve_transpose_interleave_4VL_1x4.hpp | 6 +- .../sve_transpose_interleave_4VL_2x2.hpp | 8 +- .../sve_transpose_interleave_6VL_1x8.hpp | 6 +- .../sve_transpose_interleave_6VL_2x4.hpp | 8 +- .../sve_transpose_interleave_6VL_2x4_fp32bf16.hpp | 6 +- .../sve_transpose_interleave_6VL_4x2.hpp | 7 +- .../transforms/sve_transpose_interleave_8VL.hpp | 7 +- .../sve_transpose_interleave_8VL_1x4.hpp | 6 +- .../sve_transpose_interleave_8VL_1x8.hpp | 6 +- .../sve_transpose_interleave_8VL_2x2.hpp | 7 +- .../sve_transpose_interleave_8VL_2x4.hpp | 7 +- 
.../sve_transpose_interleave_8VL_2x4_fp32bf16.hpp | 6 +- src/core/NEON/kernels/arm_gemm/utils.hpp | 5 +- 320 files changed, 66812 insertions(+), 67044 deletions(-) delete mode 100644 src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp (limited to 'src/core/NEON/kernels/arm_gemm') diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 515d55c73b..2d743a4bd6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2022 Arm Limited. + * Copyright (c) 2017-2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,8 @@ #include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp" #endif // ARM_COMPUTE_ENABLE_SME2 +#include "kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp" +#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp" #include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp" #include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp" #include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp" @@ -204,6 +206,30 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } ), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "a64_ffinterleaved_bf16fp32_mmla_8x12", + KernelWeightFormat::VL256_BL64, + [](const GemmArgs &args) { return args._ci->has_bf16(); }, + [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "a64_ffhybrid_bf16fp32_mmla_6x16", + KernelWeightFormat::VL256_BL64, + [](const GemmArgs &args) { return args._ci->has_bf16(); }, + [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat(args); } +), +GemmImplementation::with_estimate( + GemmMethod::GEMM_INTERLEAVED, + "a64_ffinterleaved_bf16fp32_dot_8x12", + KernelWeightFormat::VL128_BL32, + [](const GemmArgs &args) { return args._ci->has_bf16(); }, + [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, + [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } +), #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index ee567a2498..44a7bb894a 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -66,6 +66,10 @@ #include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp" #endif // ARM_COMPUTE_ENABLE_SME2 +#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp" +#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp" +#include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp" +#include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp" #include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp" #include 
"kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp" #include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index 19c8fcadd3..5e77df7d4a 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, 2022 Arm Limited. + * Copyright (c) 2018-2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -212,9 +212,11 @@ struct GemmImplementation { instantiate(instantiate) { } }; -/* "Main" function implemented for each valid combination of types. - * Returns a list of GEMM implementation descriptors for processing by the - * other functions, ended by an implementation with +/* Provides the list of implementation descriptors which is processed by the + * other functions. + * + * A specialised version is provided for each supported combination of types. + * The end of the list is indicated by a sentinel descriptor with * method==GemmMethod::DEFAULT. */ template const GemmImplementation *gemm_implementation_list(); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp index 18d8fc9312..aa6ecc2919 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2022 Arm Limited. + * Copyright (c) 2017-2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,7 @@ const GemmImplementation *gemm_implementation_list gemm(const GemmArgs &args, const Nothing &); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Nothing &); +template KernelDescription get_gemm_method(const GemmArgs &args, const Nothing &); template std::vector get_compatible_kernels (const GemmArgs &args, const Nothing &); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index b0a01886d2..fd20e53f60 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -186,6 +186,7 @@ const GemmImplementation *gemm_implementation_list gemm(const GemmArgs &args, const Nothing &); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Nothing &); +template KernelDescription get_gemm_method(const GemmArgs &args, const Nothing &); template std::vector get_compatible_kernels (const GemmArgs &args, const Nothing &); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp deleted file mode 100644 index b71f390ab9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "arm_gemm.hpp" -#include "utils.hpp" - -#include "mergeresults.hpp" -#include "transform.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -#include <algorithm> -#include <cassert> -#include <cmath> - -// Some macros used to decide how much working space to allocate. -// Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) - -// Implementation of the GemmCommon abstract class. -// -// This implementation interleaves the source matrices in blocks - good for -// larger matrices. -namespace arm_gemm { - -template<typename strategy, typename To, typename Tr> -class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> { - typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; - - /* const properties set by constructor */ - const CPUInfo * const _ci; - - const unsigned int _Msize; - const unsigned int _Nsize; - const unsigned int _Ksize; - - const unsigned int _nbatches; - const unsigned int _nmulti; - - const Activation _act; - - const int _maxthreads; - int _nthreads; - - /* Blocking info */ - unsigned int _k_block=0; - unsigned int _x_block=0; - - unsigned int _Mround_div=0; - unsigned int _Mround=0; - unsigned int _Nround_div=0; - unsigned int _Nround=0; - - /* Working space, pretransposed buffer */ - const Toi *_B_transposed=nullptr; - void *_working_space=nullptr; - - /* We will need to walk through the blocks of B in a few contexts, so - * factor that out. */ - class blockwalker { - private: - /* Size loops, etc. based on our parent's configuration */ - const GemmInterleavedPretransposed2d &_parent; - - /* K, X and multi parameters for current iteration. */ - unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0; - - unsigned int _index=0; - bool _done=false; - bool _newkblock=true; - bool _newmulti=true; - - public: - blockwalker(const GemmInterleavedPretransposed2d &parent) - : _parent(parent) - , _xmax { parent._Nsize } - { } - - blockwalker(const GemmInterleavedPretransposed2d &parent, unsigned int x0, unsigned int xmax) - : _parent(parent) - , _x0 { x0 } - , _xmin { x0 } - , _xmax { xmax } - { - assert(_x0 <= _xmax); - } - - unsigned int xmax() { - return std::min(_x0 + _parent._x_block, _xmax); - } - - unsigned int kmax() { - return std::min(_k0 + _parent._k_block, _parent._Ksize); - } - - /* Advance to the next block, return false at the end. 
*/ - bool advance(void) { - if (_done) { - return false; - } - - _newkblock=false; - _x0 += _parent._x_block; - if (_x0 >= _xmax) { - _x0=_xmin; - _k0 += _parent._k_block; - if (_k0 >= _parent._Ksize) { - _k0=0; - _multi++; - if (_multi >= _parent._nmulti) { - _done=true; - return false; - } - _newmulti=true; - } - _newkblock=true; - } - _index++; - - return true; - } - - unsigned int k0(void) { return _k0; } - unsigned int x0(void) { return _x0; } - unsigned int multi(void) { return _multi; } - unsigned int index(void) { return _index; } - bool done(void) { return _done; } - bool newkblock(void) { return _newkblock; } - }; - - // A working size: One of these needed, regardless of thread count. Divided according to window. - size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; - } - - // As B will be pretransposed we do not need to allocate any space for it - size_t get_b_working_size() const { - return 0; - } - - // C working size: One needed per thread. - size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); - } - - // Internal execute function. - // This supports both the "pretransposed" and "standard" interfaces via the template parameter. - void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) { - /* Make sure we've been set up correctly. */ - assert(_B_transposed); - assert(_working_space); - assert(this->_Aptr); - assert(this->_Cptr); - -#ifdef CYCLE_PROFILING - profiler prof; -#endif - strategy strat(_ci); - - /* Translate 'start' and 'end' into a position within the batches and rows. */ - const unsigned int window_per_batch = _Mround / strategy::out_height(); - unsigned int batch_0 = m_start / window_per_batch; - unsigned int batch_end = m_end / window_per_batch; - - /* Compute the M values to operate on */ - unsigned int m_0 = (m_start - (batch_0 * window_per_batch)) * strategy::out_height(); - unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height(); - - unsigned int n_0 = std::min(this->_Nsize, strategy::out_width() * n_start); - unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end); - - blockwalker current(*this, n_0, n_max); - - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); - - auto c_panel_start = working_space_bytes; - auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads; - - auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid); - auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid); - - /* B^t is stored in interleaved panels separated by their K-block component - * we want to store a pointer to the start of the current k-page - * then when we come to the next k-block we just add the size of the previous to - * this base pointer - */ - const Toi *b_panel_start = _B_transposed; - // b_panel stores a pointer to the start of our current block inside of the k-block - const Toi *b_panel = b_panel_start; - - // newkblock() is always true on the first iteration, so this will be set properly on the first loop. 
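// For context, a standalone sketch of how execute_pretranspose() above carves the
// shared working space into per-thread C and A panels, mirroring
// get_c_working_size()/get_a_working_size() and the cache-line alignment done by
// set_working_space(). The panel sizes and thread count are assumed example
// values for illustration, not numbers taken from this patch.
#include <cstdint>
#include <cstdio>

int main() {
    const int max_threads = 4;
    const std::uintptr_t c_size = 4096;  // stands in for get_c_working_size()
    const std::uintptr_t a_size = 65536; // stands in for get_a_working_size()
    static std::uint8_t pool[4 * (4096 + 65536) + 64];

    // Align the pool to a 64-byte cache line, as set_working_space() does.
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(pool);
    std::uint8_t *base = pool + ((64 - (p & 0x3F)) & 0x3F);

    std::uint8_t *c_panels = base;                        // C panels: one per thread
    std::uint8_t *a_panels = base + c_size * max_threads; // A panels follow all of C

    for (int t = 0; t < max_threads; t++) {
        const std::uint8_t *c_panel = c_panels + c_size * t;
        const std::uint8_t *a_panel = a_panels + a_size * t;
        std::printf("thread %d: c at +%td, a at +%td\n", t, c_panel - base, a_panel - base);
    }
    return 0;
}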
- unsigned b_page_size = 0; - int kern_k = 0; - for (;!current.done();current.advance()) { - int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); - - if (current.newkblock()) { - kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); - kern_k *= strat.k_unroll(); - - unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width()); - - b_panel_start += b_page_size; - b_panel = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k); - b_page_size = _Nround * kern_k; - - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; - unsigned int last_m = (batch == batch_end) ? m_max : _Msize; - - if (first_m >= last_m) - continue; - - auto a_thread_panel_in = this->_Aptr - + (batch * this->_A_batch_stride) - + (current.multi() * this->_A_multi_stride); - - auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block); - - strat.transforms.PrepareA( - a_thread_panel_out, - a_thread_panel_in, - this->_lda, - first_m, - last_m, - current.k0(), - current.kmax(), - 0); - } - } - - /* Do the actual work. */ - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; - unsigned int last_m = (batch == batch_end) ? m_max : _Msize; - - const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; - - if (first_m >= last_m) - continue; - - for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) { - unsigned int ymax = std::min(std::min(y + strategy::out_height(), last_m), _Msize); - - strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); - a_ptr += (strategy::out_height() * kern_k); - - const bool first_pass = (current.k0() == 0); - const bool last_pass = (current.kmax() == _Ksize); - - auto c_panel_out = this->_Cptr - + this->_C_batch_stride * batch - + this->_C_multi_stride * current.multi(); - - auto bias = (first_pass && this->_bias) - ? this->_bias + (current.multi() * this->_bias_multi_stride) - : nullptr; - - auto act = last_pass ? _act : Activation(); - - strat.transforms.Merge( - c_panel_out, - c_panel, - this->_ldc, - y, - ymax, - current.x0(), - current.xmax(), - bias, - act, - !first_pass); // Append - } - } - - b_panel += (bblocks * strat.out_width() * kern_k); - } - } - - static unsigned int get_k_block_size(const GemmArgs &args) { - // Work out blocking parameters, or override from provided GemmConfig - if (args._cfg && args._cfg->inner_block_size) { - return args._cfg->inner_block_size; - } - - const unsigned int L1_size = args._ci->get_L1_cache_size(); - unsigned int k_block; - - // k_block: Find out how much of the larger array can be loaded into half the cache. - // This should account for associative caches. - k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - - // Needs to be (at least a single) multiple of the K unroll level. - k_block /= strategy::k_unroll(); - k_block = std::max(k_block, 1U) * strategy::k_unroll(); - - // Now tune to presented problem size; this is how many blocks we need. - unsigned int numk_blocks = iceildiv(args._Ksize, k_block); - - // So divide the space equally into that many blocks. - k_block = iceildiv(args._Ksize, numk_blocks); - - // And round UP to the K unroll level required. 
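// A runnable sketch of the k_block sizing performed by get_k_block_size() above
// (its final rounding continues just below), with the strategy geometry passed
// in as plain parameters. The cache size and kernel dimensions here are assumed
// example values, not measured properties of any particular core.
#include <algorithm>
#include <cstdio>

static unsigned int iceildiv_(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

static unsigned int pick_k_block(unsigned int l1_size, unsigned int elem_size,
                                 unsigned int out_width, unsigned int out_height,
                                 unsigned int k_unroll, unsigned int K) {
    // Fit the larger operand panel into half of L1.
    unsigned int k_block = (l1_size / 2) / (elem_size * std::max(out_width, out_height));
    // Round to a (non-zero) multiple of the K unroll amount.
    k_block = std::max(k_block / k_unroll, 1u) * k_unroll;
    // Divide K evenly over the number of blocks this implies...
    unsigned int num_blocks = iceildiv_(K, k_block);
    k_block = iceildiv_(K, num_blocks);
    // ...and round the result back up to the unroll amount.
    return iceildiv_(k_block, k_unroll) * k_unroll;
}

int main() {
    // Assumed: 32KB L1, fp32 (4 bytes), a 12x8 kernel, k_unroll of 1, K=1000.
    std::printf("k_block=%u\n", pick_k_block(32768, 4, 12, 8, 1, 1000));
    return 0;
}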
- k_block = iceildiv(k_block, strategy::k_unroll()); - k_block *= strategy::k_unroll(); - - return k_block; - } - -public: - GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete; - GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete; - - /* Constructor */ - GemmInterleavedPretransposed2d(const GemmArgs &args) - : _ci(args._ci) - , _Msize(args._Msize) - , _Nsize(args._Nsize) - , _Ksize(args._Ksize) - , _nbatches(args._nbatches) - , _nmulti(args._nmulti) - , _act(args._act) - , _maxthreads(args._maxthreads) - , _nthreads(args._maxthreads) - , _k_block(get_k_block_size(args)) - // Work out the rounded size of M - needed for some buffers. - , _Mround_div ( iceildiv(_Msize, strategy::out_height()) ) - , _Mround ( _Mround_div * strategy::out_height() ) - - , _Nround_div ( iceildiv(_Nsize, strategy::out_width()) ) - , _Nround ( _Nround_div * strategy::out_width() ) - { - assert(_maxthreads > 0); - - const unsigned int L2_size = _ci->get_L2_cache_size(); - - if (args._cfg && args._cfg->outer_block_size) { - _x_block = args._cfg->outer_block_size; - } else { - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); - - // Needs to be (at least a single) multiple of the kernel output width. - _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); - - // And tune to the presented problem size. - unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); - - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); - } - } - - // Interface implementation - Compulsory functions - ndrange_t get_window_size() const override { - unsigned m = (_Mround / strategy::out_height()) * _nbatches; - unsigned n = _Nround_div; - - return { m, n }; - } - - bool supports_dynamic_scheduling() const override { - return true; - } - - // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads. 
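// A matching sketch of the L2-based x_block choice made in the constructor
// above: budget 90% of L2, subtract the L1-resident panels, then round to the
// kernel output width and tune to the problem. Again, the cache size and
// geometry are assumed example values for illustration only.
#include <algorithm>
#include <cstdio>

static unsigned int iceildiv2_(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

static unsigned int pick_x_block(unsigned int l2_size, unsigned int elem_size,
                                 unsigned int k_block, unsigned int out_width,
                                 unsigned int out_height, unsigned int N) {
    // Rows of length k_block that fit in ~90% of L2, less the L1 contents.
    unsigned int x_block = (((l2_size * 9) / 10) -
                            (k_block * elem_size * (out_width + out_height))) /
                           (elem_size * k_block);
    x_block = std::max(x_block / out_width, 1u) * out_width;
    // Tune to the problem: split N evenly, then re-round to out_width.
    unsigned int num_blocks = iceildiv2_(N, x_block);
    x_block = iceildiv2_(N, num_blocks);
    return iceildiv2_(x_block, out_width) * out_width;
}

int main() {
    // Assumed: 512KB L2, fp32 (4 bytes), k_block=334, a 12x8 kernel, N=2000.
    std::printf("x_block=%u\n", pick_x_block(524288, 4, 334, 12, 8, 2000));
    return 0;
}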
- void set_nthreads(int nthreads) override { - _nthreads = std::min(nthreads, _maxthreads); - } - - void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override { - /* This particular GEMM implementation can only be broken up over the M & N - * dimensions; we inform the framework of this limitation via the get_window_size function - */ - const auto m_start = work_range.get_position(0); - const auto n_start = work_range.get_position(1); - const auto m_size = work_range.get_size(0); - const auto n_size = work_range.get_size(1); - const auto m_end = m_start + m_size; - const auto n_end = n_start + n_size; - - const auto m_threadid = thread_locator.get_position(0); - const auto n_threadid = thread_locator.get_position(1); - - execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid); - } - - std::size_t get_working_size() const override { - /* Because we do not know how the scheduler will break up - * the task, we need to allocate enough space to handle both - * the case where every thread is parallelised across B AND the - * case where every thread is parallelised across A. - */ - return get_c_working_size() * _maxthreads - + get_a_working_size() * _maxthreads - + 64; // to account for cacheline alignment - } - - - void set_working_space(void *working_space) override { - // Make sure everything ends up cache line aligned - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); - intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space); - - size_t diff=0; - - if (working_space_int & 0x3F) { - diff = 0x40 - (working_space_int & 0x3F); - } - - working_space_bytes += diff; - - _working_space = reinterpret_cast<void *>(working_space_bytes); - } - - // Interface implementation - pretransposed - bool B_is_pretransposed() const override { - return true; - } - - bool B_pretranspose_required() const override { - return _B_transposed==nullptr; - } - - // TODO: this could almost certainly be considerably simpler. - size_t get_B_pretransposed_array_size() const override { - size_t total=0; - blockwalker current(*this); - - do { - /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); - unsigned int k_size = (current.kmax() - current.k0()); - - /* Round sizes up as needed. */ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); - - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); - - total += x_size * k_size * sizeof(Toi); - } while (current.advance()); - - return total; - } - - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { - blockwalker current(*this); - Toi *buffer = reinterpret_cast<Toi *>(in_buffer); - _B_transposed = buffer; - strategy strat(_ci); - - do { - /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); - unsigned int k_size = (current.kmax() - current.k0()); - - /* Round sizes up as needed. 
*/ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); - - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); - - strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); - - buffer += (x_size * k_size); - } while (current.advance()); - } - - void set_pretransposed_B_data(void *in_buffer) override { - _B_transposed = reinterpret_cast<Toi *>(in_buffer); - } - - // Estimate cycles for the given problem with the provided parameters - static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) { - unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args)); - unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches; - unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width()); - - uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll()); - uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi); - uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr); - - // Wide problems incur extra preparation cost, as it is done per thread. - // Duplicate the logic the scheduler will later use to figure out how much that will affect us - float ratio = m_blocks / static_cast<float>(n_blocks); - - unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5); - unsigned int height = 1; - - if (ideal_height == 0) { - height = 1; - } else { - for (unsigned int adj=0; adj<ideal_height; adj++) { - const unsigned int round_down = ideal_height - adj; - if (round_down && ((args._maxthreads % round_down) == 0)) { - height = round_down; - break; - } - - const unsigned int round_up = ideal_height + adj; - if ((args._maxthreads % round_up) == 0) { - height = round_up; - break; - } - } - } - - // Threads in the same row of the grid each prepare their own copy of A. - prepare_bytes *= iceildiv(static_cast<unsigned int>(args._maxthreads), height); - - float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle; - float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle; - float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle; - - float total_cycles = mac_cycles + prepare_cycles + merge_cycles; - - // We can't thread over multis, which might be a problem in some - // threaded cases. Penalize that here. 
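// A condensed, runnable sketch of the cycle model above: MAC, prepare and
// merge work divided by per-cycle throughputs, with the prepare term scaled by
// the thread-grid width in the spirit of the wide-problem reasoning above. All
// throughput and problem numbers here are assumptions for illustration.
#include <cmath>
#include <cstdio>

int main() {
    const double total_macs = 512e6, prepare_bytes = 8e6, merge_bytes = 16e6;
    const double macs_per_cycle = 32.0, prep_bytes_per_cycle = 8.0, merge_bytes_per_cycle = 8.0;
    const unsigned max_threads = 8;
    const double m_over_n = 4.0; // ratio of M blocks to N blocks

    // Mirror the scheduler: thread-grid height ~ sqrt(threads * ratio).
    unsigned height = static_cast<unsigned>(std::sqrt(max_threads * m_over_n) + 0.5);
    if (height == 0 || height > max_threads) height = 1;
    const unsigned width = (max_threads + height - 1) / height; // threads per row

    // Every thread in a row re-prepares the same A panels, so scale prepare.
    const double cycles = total_macs / macs_per_cycle
                        + (prepare_bytes * width) / prep_bytes_per_cycle
                        + merge_bytes / merge_bytes_per_cycle;
    std::printf("estimated cycles: %.0f (grid %ux%u)\n", cycles, height, width);
    return 0;
}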
- float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9; - - if (parallelism_available < args._maxthreads) { - total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available); - } - - return static_cast<uint64_t>(total_cycles); - } -}; - -} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index 9e8907d60f..c725815859 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -80,7 +80,7 @@ static const GemmImplementation gemm_qint8_methods { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_s8q_mopa_1VLx4VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length(); return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } @@ -88,7 +88,7 @@ static const GemmImplementation gemm_qint8_methods { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_s8q_mopa_4VLx1VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length(); return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } @@ -96,7 +96,7 @@ static const GemmImplementation gemm_qint8_methods { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_s8q_mopa_2VLx2VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } }, @@ -265,6 +265,7 @@ const GemmImplementation *gemm_implementation_list template UniqueGemmCommon gemm(const GemmArgs &args, const Requantize32 &os); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os); +template KernelDescription get_gemm_method(const GemmArgs &args, const Requantize32 &os); template std::vector 
get_compatible_kernels(const GemmArgs &args, const Requantize32 &os); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp index f93f56b57d..6254ec668d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp @@ -76,7 +76,7 @@ static const GemmImplementation gemm_quint8_meth { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_u8q_mopa_1VLx4VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length(); return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } @@ -84,7 +84,7 @@ static const GemmImplementation gemm_quint8_meth { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_u8q_mopa_4VLx1VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, [](const GemmArgs &args, const Requantize32 &) { const auto VL = sme::get_vector_length(); return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } @@ -92,7 +92,7 @@ static const GemmImplementation gemm_quint8_meth { GemmMethod::GEMM_INTERLEAVED, "sme2_interleaved_nomerge_u8q_mopa_2VLx2VL", - [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sme2() && args._maxthreads == 1 && ((qp.per_channel_requant && (qp.per_channel_left_shifts == nullptr)) || (!qp.per_channel_requant && (qp.per_layer_left_shift == 0)));}, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedPretransposedNoMergeQuantizedInline(args, qp); } }, @@ -233,6 +233,7 @@ const GemmImplementation *gemm_implementation_li template UniqueGemmCommon gemm(const GemmArgs &args, const Requantize32 &os); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Requantize32 &os); +template KernelDescription get_gemm_method(const GemmArgs &args, const Requantize32 &os); template std::vector get_compatible_kernels(const GemmArgs &args, const Requantize32 &os); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp index fc836f9790..25b6cf0cf2 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp +++ 
b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2022 Arm Limited. + * Copyright (c) 2017-2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,7 @@ const GemmImplementation *gemm_implementation_list gemm(const GemmArgs &args, const Nothing &); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Nothing &); +template KernelDescription get_gemm_method(const GemmArgs &args, const Nothing &); template std::vector get_compatible_kernels(const GemmArgs &args, const Nothing &); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index fcc95eb503..af5cfbbf2b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -150,6 +150,7 @@ const GemmImplementation *gemm_implementation_list gemm(const GemmArgs &args, const Nothing &); template bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const Nothing &); +template KernelDescription get_gemm_method(const GemmArgs &args, const Nothing &); template std::vector get_compatible_kernels (const GemmArgs &args, const Nothing &); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp index 4dfe46446e..e4bfc0f6e4 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp @@ -170,7 +170,6 @@ void interleave_block<4, 16, VLType::None, false>( "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "12:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "x20", "x21", "x22", "x23" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp index 56ca49a36e..23800edf20 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp @@ -210,8 +210,8 @@ void interleave_block<4, 16, VLType::None, true>( "sadalp v22.4s, v26.8h\n" "sadalp v21.4s, v25.8h\n" "addp v24.4s, v24.4s, v23.4s\n" - "addp v23.4s, v22.4s, v21.4s\n" - "addp v24.4s, v24.4s, v23.4s\n" + "addp v16.4s, v22.4s, v21.4s\n" + "addp v24.4s, v24.4s, v16.4s\n" "add v24.4s, v24.4s, v20.4s\n" "str q24, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp index 4c7bb71fb2..15545c24db 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp @@ -210,8 +210,8 @@ void interleave_block<4, 16, VLType::None, true>( "uadalp v22.4s, v26.8h\n" "uadalp v21.4s, v25.8h\n" "addp v24.4s, v24.4s, v23.4s\n" - "addp v23.4s, v22.4s, v21.4s\n" - "addp v24.4s, v24.4s, v23.4s\n" + "addp v16.4s, v22.4s, v21.4s\n" + 
"addp v24.4s, v24.4s, v16.4s\n" "add v24.4s, v24.4s, v20.4s\n" "str q24, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp index 2ba2aa854a..b900c330b7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp @@ -80,36 +80,36 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d28, [x28], #0x8\n" - "ldr d27, [x27], #0x8\n" - "shll v28.4s, v28.4h, #0x10\n" + "ldr d27, [x28], #0x8\n" + "ldr d26, [x27], #0x8\n" "shll v27.4s, v27.4h, #0x10\n" + "shll v26.4s, v26.4h, #0x10\n" "ldr d22, [x26], #0x8\n" "ldr d21, [x25], #0x8\n" "shll v22.4s, v22.4h, #0x10\n" "shll v21.4s, v21.4h, #0x10\n" - "ldr d26, [x24], #0x8\n" + "ldr d20, [x24], #0x8\n" "ldr d25, [x23], #0x8\n" - "shll v26.4s, v26.4h, #0x10\n" - "shll v25.4s, v25.4h, #0x10\n" - "ldr d20, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" "shll v20.4s, v20.4h, #0x10\n" + "shll v25.4s, v25.4h, #0x10\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" "shll v19.4s, v19.4h, #0x10\n" - "zip1 v24.4s, v28.4s, v22.4s\n" - "zip1 v23.4s, v27.4s, v21.4s\n" + "shll v16.4s, v16.4h, #0x10\n" + "zip1 v24.4s, v27.4s, v22.4s\n" + "zip1 v23.4s, v26.4s, v21.4s\n" "subs %x[width], %x[width], #0x4\n" "cmp %x[width], #0x4\n" - "zip1 v18.4s, v26.4s, v20.4s\n" - "zip1 v17.4s, v25.4s, v19.4s\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "zip1 v17.4s, v25.4s, v16.4s\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "zip2 v21.4s, v26.4s, v21.4s\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v20.4s, v26.4s, v20.4s\n" - "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v19.4s, v25.4s, v16.4s\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "prfm pldl1keep, [x22, #0x70]\n" @@ -138,71 +138,70 @@ void interleave_block<8, 1, VLType::None, false>( "ldr s28, [x28], #0x4\n" "ldr s27, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s22, [x26], #0x4\n" - "ldr s21, [x25], #0x4\n" - "ldr s26, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s26, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s24, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" "tbz %x[width], #0, 5f\n" "ld1 { v28.h }[2], [x28]\n" "ld1 { v27.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v22.h }[2], [x26]\n" - "ld1 { v21.h }[2], [x25]\n" - "ld1 { v26.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v20.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" + "ld1 { v26.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "ld1 { v22.h }[2], [x22]\n" + "ld1 { v21.h }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 "ldr h28, [x28, #0x0]\n" "ldr h27, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h22, [x26, #0x0]\n" - "ldr h21, [x25, #0x0]\n" - "ldr h26, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h20, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" + "ldr h26, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h24, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "ldr 
h21, [x21, #0x0]\n" "5:" // Odd load end "shll v28.4s, v28.4h, #0x10\n" "shll v27.4s, v27.4h, #0x10\n" "subs x20, x20, #0x1\n" - "shll v22.4s, v22.4h, #0x10\n" - "shll v21.4s, v21.4h, #0x10\n" "shll v26.4s, v26.4h, #0x10\n" "shll v25.4s, v25.4h, #0x10\n" - "shll v20.4s, v20.4h, #0x10\n" - "shll v19.4s, v19.4h, #0x10\n" - "zip1 v24.4s, v28.4s, v22.4s\n" - "zip1 v23.4s, v27.4s, v21.4s\n" - "zip1 v18.4s, v26.4s, v20.4s\n" - "zip1 v17.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v24.4s, v23.4s\n" + "shll v24.4s, v24.4h, #0x10\n" + "shll v23.4s, v23.4h, #0x10\n" + "shll v22.4s, v22.4h, #0x10\n" + "shll v21.4s, v21.4h, #0x10\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" "subs x20, x20, #0x1\n" - "zip2 v16.4s, v24.4s, v23.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v17.4s, v18.4s, v17.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "zip2 v20.4s, v26.4s, v20.4s\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v22.4s, v21.4s\n" + "zip2 v19.4s, v28.4s, v26.4s\n" + "zip2 v16.4s, v27.4s, v25.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v16.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v20.4s, v19.4s\n" - "str q18, [%x[out_ptr], #0x10]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp index f55c2be4a4..e54b3b9f41 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp @@ -80,33 +80,33 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q25, [x28], #0x10\n" - "ldr q30, [x27], #0x10\n" + "ldr q27, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" - "ldr q29, [x26], #0x10\n" - "ldr q28, [x25], #0x10\n" + "ldr q26, [x26], #0x10\n" + "ldr q24, [x25], #0x10\n" "ldr q21, [x24], #0x10\n" - "ldr q27, [x23], #0x10\n" + "ldr q20, [x23], #0x10\n" "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v26.8h, v30.8h, v27.8h\n" - "ldr q20, [x22], #0x10\n" - "ldr q22, [x21], #0x10\n" - "zip1 v19.8h, v29.8h, v20.8h\n" - "zip1 v18.8h, v28.8h, v22.8h\n" + "zip1 v22.8h, v27.8h, v20.8h\n" + "ldr q17, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v19.8h, v26.8h, v17.8h\n" + "zip1 v18.8h, v24.8h, v16.8h\n" "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v21.8h, v26.8h, v17.8h\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, 
v22.8h\n" + "zip2 v20.8h, v27.8h, v20.8h\n" + "zip2 v16.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" "zip1 v24.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v26.8h, v18.8h\n" + "zip1 v17.8h, v22.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "zip2 v23.8h, v23.8h, v19.8h\n" - "zip2 v19.8h, v26.8h, v18.8h\n" + "zip2 v19.8h, v22.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" "zip1 v22.8h, v25.8h, v21.8h\n" @@ -134,132 +134,131 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr d25, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d22, [x21], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" - "ld1 { v25.s }[2], [x28], #0x4\n" - "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" "mov x20, #0x6\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v20.s }[2], [x22], #0x4\n" - "ld1 { v22.s }[2], [x21], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[6], [x28]\n" - "ld1 { v30.h }[6], [x27]\n" + "ld1 { v30.h }[6], [x28]\n" + "ld1 { v29.h }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.h }[6], [x26]\n" - "ld1 { v28.h }[6], [x25]\n" - "ld1 { v21.h }[6], [x24]\n" - "ld1 { v27.h }[6], [x23]\n" - "ld1 { v20.h }[6], [x22]\n" - "ld1 { v22.h }[6], [x21]\n" + "ld1 { v28.h }[6], [x26]\n" + "ld1 { v27.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[4], [x28]\n" - "ld1 { v30.h }[4], [x27]\n" + "ld1 { v30.h }[4], [x28]\n" + "ld1 { v29.h }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.h }[4], [x26]\n" - "ld1 { v28.h }[4], [x25]\n" - "ld1 { v21.h }[4], [x24]\n" - "ld1 { v27.h }[4], [x23]\n" - "ld1 { v20.h }[4], [x22]\n" - "ld1 { v22.h }[4], [x21]\n" + "ld1 { v28.h }[4], [x26]\n" + "ld1 { v27.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr s25, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s22, [x21], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[2], [x28]\n" - "ld1 { v30.h }[2], [x27]\n" + "ld1 { v30.h }[2], [x28]\n" + "ld1 { v29.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v28.h }[2], [x25]\n" - "ld1 { v21.h }[2], 
[x24]\n" - "ld1 { v27.h }[2], [x23]\n" - "ld1 { v20.h }[2], [x22]\n" - "ld1 { v22.h }[2], [x21]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr h25, [x28, #0x0]\n" - "ldr h30, [x27, #0x0]\n" + "ldr h30, [x28, #0x0]\n" + "ldr h29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h28, [x25, #0x0]\n" - "ldr h21, [x24, #0x0]\n" - "ldr h27, [x23, #0x0]\n" - "ldr h20, [x22, #0x0]\n" - "ldr h22, [x21, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v19.8h, v29.8h, v20.8h\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip1 v26.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v22.8h\n" - "zip1 v24.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v24.8h, v17.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v24.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v23.8h, v19.8h\n" - "zip2 v19.8h, v26.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v22.8h\n" - "zip1 v22.8h, v25.8h, v21.8h\n" - "zip1 v18.8h, v20.8h, v16.8h\n" - "zip1 v19.8h, v22.8h, v18.8h\n" - "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v18.8h, v22.8h, v18.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v25.8h, v21.8h\n" - "zip2 v20.8h, v20.8h, v16.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git 
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp index f64db0b476..3a5dcf4a6b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp @@ -79,36 +79,36 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr d28, [x28], #0x8\n" - "ldr d27, [x27], #0x8\n" - "fcvtl v28.4s, v28.4h\n" + "ldr d27, [x28], #0x8\n" + "ldr d26, [x27], #0x8\n" "fcvtl v27.4s, v27.4h\n" + "fcvtl v26.4s, v26.4h\n" "ldr d22, [x26], #0x8\n" "ldr d21, [x25], #0x8\n" "fcvtl v22.4s, v22.4h\n" "fcvtl v21.4s, v21.4h\n" - "ldr d26, [x24], #0x8\n" + "ldr d20, [x24], #0x8\n" "ldr d25, [x23], #0x8\n" - "fcvtl v26.4s, v26.4h\n" - "fcvtl v25.4s, v25.4h\n" - "ldr d20, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" "fcvtl v20.4s, v20.4h\n" + "fcvtl v25.4s, v25.4h\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" "fcvtl v19.4s, v19.4h\n" - "zip1 v24.4s, v28.4s, v22.4s\n" - "zip1 v23.4s, v27.4s, v21.4s\n" + "fcvtl v16.4s, v16.4h\n" + "zip1 v24.4s, v27.4s, v22.4s\n" + "zip1 v23.4s, v26.4s, v21.4s\n" "subs %x[width], %x[width], #0x4\n" "cmp %x[width], #0x4\n" - "zip1 v18.4s, v26.4s, v20.4s\n" - "zip1 v17.4s, v25.4s, v19.4s\n" + "zip1 v18.4s, v20.4s, v19.4s\n" + "zip1 v17.4s, v25.4s, v16.4s\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "zip2 v21.4s, v26.4s, v21.4s\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v20.4s, v26.4s, v20.4s\n" - "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v20.4s, v20.4s, v19.4s\n" + "zip2 v19.4s, v25.4s, v16.4s\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "prfm pldl1keep, [x22, #0x70]\n" @@ -137,71 +137,70 @@ void interleave_block<8, 1, VLType::None, false>( "ldr s28, [x28], #0x4\n" "ldr s27, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s22, [x26], #0x4\n" - "ldr s21, [x25], #0x4\n" - "ldr s26, [x24], #0x4\n" - "ldr s25, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s26, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s24, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" "tbz %x[width], #0, 5f\n" "ld1 { v28.h }[2], [x28]\n" "ld1 { v27.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v22.h }[2], [x26]\n" - "ld1 { v21.h }[2], [x25]\n" - "ld1 { v26.h }[2], [x24]\n" - "ld1 { v25.h }[2], [x23]\n" - "ld1 { v20.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" + "ld1 { v26.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x24]\n" + "ld1 { v23.h }[2], [x23]\n" + "ld1 { v22.h }[2], [x22]\n" + "ld1 { v21.h }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 "ldr h28, [x28, #0x0]\n" "ldr h27, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h22, [x26, #0x0]\n" - "ldr h21, [x25, #0x0]\n" - "ldr h26, [x24, #0x0]\n" - "ldr h25, [x23, #0x0]\n" - "ldr h20, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" + "ldr h26, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h24, [x24, #0x0]\n" + "ldr h23, [x23, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "ldr h21, [x21, #0x0]\n" "5:" // Odd load end "fcvtl v28.4s, v28.4h\n" "fcvtl v27.4s, v27.4h\n" "subs x20, x20, #0x1\n" - "fcvtl v22.4s, v22.4h\n" - "fcvtl v21.4s, v21.4h\n" "fcvtl v26.4s, v26.4h\n" "fcvtl v25.4s, 
v25.4h\n" - "fcvtl v20.4s, v20.4h\n" - "fcvtl v19.4s, v19.4h\n" - "zip1 v24.4s, v28.4s, v22.4s\n" - "zip1 v23.4s, v27.4s, v21.4s\n" - "zip1 v18.4s, v26.4s, v20.4s\n" - "zip1 v17.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v24.4s, v23.4s\n" + "fcvtl v24.4s, v24.4h\n" + "fcvtl v23.4s, v23.4h\n" + "fcvtl v22.4s, v22.4h\n" + "fcvtl v21.4s, v21.4h\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" "subs x20, x20, #0x1\n" - "zip2 v16.4s, v24.4s, v23.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v17.4s, v18.4s, v17.4s\n" - "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "zip2 v20.4s, v26.4s, v20.4s\n" - "zip2 v19.4s, v25.4s, v19.4s\n" - "zip1 v16.4s, v22.4s, v21.4s\n" + "zip2 v19.4s, v28.4s, v26.4s\n" + "zip2 v16.4s, v27.4s, v25.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v16.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.4s, v20.4s, v19.4s\n" - "str q18, [%x[out_ptr], #0x10]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp index 6c009b34b8..80c387db47 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp @@ -79,29 +79,29 @@ void interleave_block<8, 1, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x28], #0x10\n" - "ldr q27, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q18, [x27], #0x10\n" "subs %x[width], %x[width], #0x4\n" "cmp %x[width], #0x4\n" - "ldr q22, [x26], #0x10\n" - "ldr q21, [x25], #0x10\n" - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" - "ldr q24, [x24], #0x10\n" + "ldr q17, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v25.4s, v20.4s, v17.4s\n" + "zip1 v24.4s, v18.4s, v16.4s\n" + "ldr q19, [x24], #0x10\n" "ldr q23, [x23], #0x10\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "ldr q19, [x22], #0x10\n" - "ldr q18, [x21], #0x10\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" + "zip2 v22.4s, v20.4s, v17.4s\n" + "zip2 v21.4s, v18.4s, v16.4s\n" + "ldr q18, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v20.4s, v19.4s, v18.4s\n" + "zip1 v17.4s, v23.4s, v16.4s\n" + "zip2 v19.4s, v19.4s, v18.4s\n" + "zip2 v18.4s, v23.4s, v16.4s\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - 
"zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" @@ -109,7 +109,7 @@ void interleave_block<8, 1, VLType::None, false>( "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x20]\n" "zip2 v16.4s, v20.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x30]\n" @@ -129,63 +129,62 @@ void interleave_block<8, 1, VLType::None, false>( "ldr d28, [x28], #0x8\n" "ldr d27, [x27], #0x8\n" "mov x20, #0x2\n" - "ldr d22, [x26], #0x8\n" - "ldr d21, [x25], #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" "ldr d24, [x24], #0x8\n" "ldr d23, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" "tbz %x[width], #0, 5f\n" "ld1 { v28.s }[2], [x28]\n" "ld1 { v27.s }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v21.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x26]\n" + "ld1 { v25.s }[2], [x25]\n" "ld1 { v24.s }[2], [x24]\n" "ld1 { v23.s }[2], [x23]\n" - "ld1 { v19.s }[2], [x22]\n" - "ld1 { v18.s }[2], [x21]\n" + "ld1 { v22.s }[2], [x22]\n" + "ld1 { v21.s }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 "ldr s28, [x28, #0x0]\n" "ldr s27, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr s22, [x26, #0x0]\n" - "ldr s21, [x25, #0x0]\n" + "ldr s26, [x26, #0x0]\n" + "ldr s25, [x25, #0x0]\n" "ldr s24, [x24, #0x0]\n" "ldr s23, [x23, #0x0]\n" - "ldr s19, [x22, #0x0]\n" - "ldr s18, [x21, #0x0]\n" + "ldr s22, [x22, #0x0]\n" + "ldr s21, [x21, #0x0]\n" "5:" // Odd load end - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" "subs x20, x20, #0x1\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v20.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" "subs x20, x20, #0x1\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v20.4s, v17.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 6f\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v22.4s, v21.4s\n" + "zip2 v19.4s, v28.4s, v26.4s\n" + "zip2 v16.4s, v27.4s, v25.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v16.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "6:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp index 
767d468ad1..8e06b7ecab 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp @@ -80,33 +80,33 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr q25, [x28], #0x10\n" - "ldr q30, [x27], #0x10\n" + "ldr q27, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" - "ldr q29, [x26], #0x10\n" - "ldr q28, [x25], #0x10\n" + "ldr q26, [x26], #0x10\n" + "ldr q24, [x25], #0x10\n" "ldr q21, [x24], #0x10\n" - "ldr q27, [x23], #0x10\n" + "ldr q20, [x23], #0x10\n" "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v26.8h, v30.8h, v27.8h\n" - "ldr q20, [x22], #0x10\n" - "ldr q22, [x21], #0x10\n" - "zip1 v19.8h, v29.8h, v20.8h\n" - "zip1 v18.8h, v28.8h, v22.8h\n" + "zip1 v22.8h, v27.8h, v20.8h\n" + "ldr q17, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v19.8h, v26.8h, v17.8h\n" + "zip1 v18.8h, v24.8h, v16.8h\n" "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v21.8h, v26.8h, v17.8h\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v22.8h\n" + "zip2 v20.8h, v27.8h, v20.8h\n" + "zip2 v16.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" "zip1 v24.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v26.8h, v18.8h\n" + "zip1 v17.8h, v22.8h, v18.8h\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "zip2 v23.8h, v23.8h, v19.8h\n" - "zip2 v19.8h, v26.8h, v18.8h\n" + "zip2 v19.8h, v22.8h, v18.8h\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" "zip1 v22.8h, v25.8h, v21.8h\n" @@ -134,132 +134,131 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr d25, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d22, [x21], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" - "ld1 { v25.s }[2], [x28], #0x4\n" - "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" "mov x20, #0x6\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v21.s }[2], [x24], #0x4\n" - "ld1 { v27.s }[2], [x23], #0x4\n" - "ld1 { v20.s }[2], [x22], #0x4\n" - "ld1 { v22.s }[2], [x21], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[6], [x28]\n" - "ld1 { v30.h }[6], [x27]\n" + "ld1 { v30.h }[6], [x28]\n" + "ld1 { v29.h }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.h }[6], [x26]\n" - "ld1 { v28.h }[6], [x25]\n" - "ld1 { v21.h }[6], [x24]\n" - "ld1 { v27.h }[6], [x23]\n" - "ld1 { v20.h }[6], [x22]\n" - "ld1 { v22.h }[6], [x21]\n" + "ld1 { v28.h }[6], [x26]\n" + "ld1 { v27.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[4], 
[x28]\n" - "ld1 { v30.h }[4], [x27]\n" + "ld1 { v30.h }[4], [x28]\n" + "ld1 { v29.h }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.h }[4], [x26]\n" - "ld1 { v28.h }[4], [x25]\n" - "ld1 { v21.h }[4], [x24]\n" - "ld1 { v27.h }[4], [x23]\n" - "ld1 { v20.h }[4], [x22]\n" - "ld1 { v22.h }[4], [x21]\n" + "ld1 { v28.h }[4], [x26]\n" + "ld1 { v27.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr s25, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s22, [x21], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.h }[2], [x28]\n" - "ld1 { v30.h }[2], [x27]\n" + "ld1 { v30.h }[2], [x28]\n" + "ld1 { v29.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v28.h }[2], [x25]\n" - "ld1 { v21.h }[2], [x24]\n" - "ld1 { v27.h }[2], [x23]\n" - "ld1 { v20.h }[2], [x22]\n" - "ld1 { v22.h }[2], [x21]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr h25, [x28, #0x0]\n" - "ldr h30, [x27, #0x0]\n" + "ldr h30, [x28, #0x0]\n" + "ldr h29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h28, [x25, #0x0]\n" - "ldr h21, [x24, #0x0]\n" - "ldr h27, [x23, #0x0]\n" - "ldr h20, [x22, #0x0]\n" - "ldr h22, [x21, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" "7:" // Odd load end - "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v19.8h, v29.8h, v20.8h\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip1 v26.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v22.8h\n" - "zip1 v24.8h, v23.8h, v19.8h\n" - "zip1 v17.8h, v26.8h, v18.8h\n" - "zip1 v16.8h, v24.8h, v17.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v24.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v23.8h, v19.8h\n" - "zip2 v19.8h, v26.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v22.8h\n" - "zip1 v22.8h, v25.8h, 
v21.8h\n" - "zip1 v18.8h, v20.8h, v16.8h\n" - "zip1 v19.8h, v22.8h, v18.8h\n" - "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v18.8h, v22.8h, v18.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v25.8h, v21.8h\n" - "zip2 v20.8h, v20.8h, v16.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp index a73792036a..b91ae8a948 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp @@ -159,101 +159,101 @@ void interleave_block<8, 1, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" - "ldr d31, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" "ldr d24, [x22], #0x8\n" "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 6f\n" - "ld1 { v31.s }[2], [x28], #0x4\n" - "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" "mov x20, #0x6\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" "ld1 { v24.s }[2], [x22], #0x4\n" "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[6], [x28]\n" - "ld1 { v30.h }[6], [x27]\n" + "ld1 { v30.h }[6], [x28]\n" + "ld1 { v29.h }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.h }[6], [x26]\n" - "ld1 { v28.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x24]\n" - "ld1 { v26.h }[6], [x23]\n" + "ld1 { v28.h }[6], [x26]\n" + "ld1 { v27.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" "ld1 { v24.h }[6], [x22]\n" "ld1 { v23.h }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[4], [x28]\n" - "ld1 { v30.h }[4], [x27]\n" + "ld1 { v30.h }[4], [x28]\n" + "ld1 { v29.h }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.h }[4], [x26]\n" - "ld1 { v28.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x24]\n" - "ld1 { v26.h }[4], [x23]\n" + "ld1 { v28.h }[4], [x26]\n" + 
"ld1 { v27.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" "ld1 { v24.h }[4], [x22]\n" "ld1 { v23.h }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" - "ldr s31, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" "ldr s24, [x22], #0x4\n" "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[2], [x28]\n" - "ld1 { v30.h }[2], [x27]\n" + "ld1 { v30.h }[2], [x28]\n" + "ld1 { v29.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v28.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x24]\n" - "ld1 { v26.h }[2], [x23]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" "ld1 { v24.h }[2], [x22]\n" "ld1 { v23.h }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 - "ldr h31, [x28, #0x0]\n" - "ldr h30, [x27, #0x0]\n" + "ldr h30, [x28, #0x0]\n" + "ldr h29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h28, [x25, #0x0]\n" - "ldr h27, [x24, #0x0]\n" - "ldr h26, [x23, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" "ldr h24, [x22, #0x0]\n" "ldr h23, [x21, #0x0]\n" "9:" // Odd load end - "zip1 v25.8h, v31.8h, v27.8h\n" - "zip1 v18.8h, v29.8h, v24.8h\n" - "subs x20, x20, #0x1\n" "zip1 v22.8h, v30.8h, v26.8h\n" - "zip1 v21.8h, v28.8h, v23.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v21.8h\n" - "zip1 v20.8h, v17.8h, v16.8h\n" - "str q20, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v20.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v17.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "subs x20, x20, #0x1\n" - "str q19, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v19.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v25.8h, v18.8h\n" - "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -266,11 +266,11 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v22.8h, v31.8h, v27.8h\n" - "zip2 v21.8h, v29.8h, v24.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v26.8h\n" - "zip2 v19.8h, v28.8h, v23.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" "zip1 v18.8h, v22.8h, v21.8h\n" "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" @@ -284,9 +284,9 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v22.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, 
v17.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp index 4a38187638..c41120c698 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp @@ -80,35 +80,35 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr d25, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" + "ldr d27, [x27], #0x8\n" "sshll v25.8h, v25.8b, #0x0\n" - "sshll v30.8h, v30.8b, #0x0\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "sshll v29.8h, v29.8b, #0x0\n" - "sshll v28.8h, v28.8b, #0x0\n" + "sshll v27.8h, v27.8b, #0x0\n" + "ldr d26, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "sshll v26.8h, v26.8b, #0x0\n" + "sshll v24.8h, v24.8b, #0x0\n" "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" + "ldr d20, [x23], #0x8\n" "sshll v21.8h, v21.8b, #0x0\n" - "sshll v27.8h, v27.8b, #0x0\n" - "ldr d20, [x22], #0x8\n" - "ldr d26, [x21], #0x8\n" "sshll v20.8h, v20.8b, #0x0\n" - "sshll v26.8h, v26.8b, #0x0\n" + "ldr d17, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" + "sshll v17.8h, v17.8b, #0x0\n" + "sshll v16.8h, v16.8b, #0x0\n" "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v22.8h, v29.8h, v20.8h\n" + "zip1 v22.8h, v26.8h, v17.8h\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" - "zip1 v19.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 v19.8h, v27.8h, v20.8h\n" + "zip1 v18.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v21.8h, v26.8h, v17.8h\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v26.8h\n" + "zip2 v20.8h, v27.8h, v20.8h\n" + "zip2 v16.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "zip1 v24.8h, v23.8h, v22.8h\n" @@ -142,140 +142,139 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s25, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v25.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" "mov x20, #0x6\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "ld1 { v20.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v26.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v23.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" "mov x20, #0x7\n" - 
"ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v27.b }[6], [x23]\n" - "ld1 { v20.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v23.b }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v27.b }[4], [x23]\n" - "ld1 { v20.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v23.b }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h25, [x28], #0x2\n" - "ldr h30, [x27], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" "mov x20, #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h27, [x23], #0x2\n" - "ldr h20, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h26, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h23, [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v27.b }[2], [x23]\n" - "ld1 { v20.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v23.b }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b25, [x28, #0x0]\n" - "ldr b30, [x27, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b27, [x23, #0x0]\n" - "ldr b20, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b26, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b23, [x21, #0x0]\n" "7:" // Odd load end - "sshll v25.8h, v25.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" - "subs x20, x20, #0x1\n" "sshll v29.8h, v29.8b, #0x0\n" + "subs x20, x20, #0x1\n" "sshll v28.8h, v28.8b, #0x0\n" - "sshll v21.8h, v21.8b, #0x0\n" "sshll v27.8h, v27.8b, #0x0\n" - "sshll v20.8h, v20.8b, #0x0\n" "sshll v26.8h, v26.8b, #0x0\n" - "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v22.8h, v29.8h, v20.8h\n" - "zip1 v19.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v24.8h, v23.8h, v22.8h\n" - "zip1 v17.8h, v19.8h, v18.8h\n" - "zip1 v16.8h, v24.8h, v17.8h\n" + "sshll v25.8h, v25.8b, #0x0\n" + "sshll v24.8h, v24.8b, #0x0\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v24.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" 
"add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v23.8h, v22.8h\n" - "zip2 v19.8h, v19.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v26.8h\n" - "zip1 v22.8h, v25.8h, v21.8h\n" - "zip1 v18.8h, v20.8h, v16.8h\n" - "zip1 v19.8h, v22.8h, v18.8h\n" - "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v18.8h, v22.8h, v18.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v25.8h, v21.8h\n" - "zip2 v20.8h, v20.8h, v16.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp index 3ad103c8d4..9ac7053ad8 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp @@ -167,109 +167,109 @@ void interleave_block<8, 1, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" - "ldr s31, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v31.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" "mov x20, #0x6\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" - "ld1 { v24.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { 
v26.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v23.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" - "ld1 { v24.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v23.b }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" - "ld1 { v24.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v23.b }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" - "ldr h31, [x28], #0x2\n" - "ldr h30, [x27], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" "mov x20, #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h27, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h25, [x22], #0x2\n" - "ldr h24, [x21], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h26, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h23, [x21], #0x2\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v26.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" - "ld1 { v24.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v23.b }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 - "ldr b31, [x28, #0x0]\n" - "ldr b30, [x27, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b27, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b25, [x22, #0x0]\n" - "ldr b24, [x21, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b26, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b23, [x21, #0x0]\n" "9:" // Odd load end - "sshll v31.8h, v31.8b, #0x0\n" "sshll v30.8h, v30.8b, #0x0\n" - "subs x20, x20, #0x1\n" "sshll v29.8h, v29.8b, #0x0\n" + "subs x20, x20, #0x1\n" "sshll v28.8h, v28.8b, #0x0\n" "sshll v27.8h, v27.8b, #0x0\n" "sshll v26.8h, v26.8b, #0x0\n" "sshll v25.8h, v25.8b, #0x0\n" "sshll v24.8h, v24.8b, #0x0\n" - "zip1 v23.8h, v31.8h, v27.8h\n" - "zip1 v22.8h, v29.8h, v25.8h\n" - "zip1 v21.8h, v30.8h, v26.8h\n" - "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v23.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, 
[%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "subs x20, x20, #0x1\n" - "str q19, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v19.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v23.8h, v22.8h\n" - "zip2 v17.8h, v21.8h, v20.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -282,11 +282,11 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v22.8h, v31.8h, v27.8h\n" - "zip2 v21.8h, v29.8h, v25.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v26.8h\n" - "zip2 v19.8h, v28.8h, v24.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" "zip1 v18.8h, v22.8h, v21.8h\n" "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" @@ -300,9 +300,9 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v22.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp index de29d77a22..c01d980f49 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp @@ -159,101 +159,101 @@ void interleave_block<8, 1, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" - "ldr d31, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" "ldr d24, [x22], #0x8\n" "ldr d23, [x21], #0x8\n" "tbz %x[width], #1, 6f\n" - "ld1 { v31.s }[2], [x28], #0x4\n" - "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v30.s }[2], [x28], #0x4\n" + "ld1 { v29.s }[2], [x27], #0x4\n" "mov x20, #0x6\n" - "ld1 { v29.s }[2], [x26], #0x4\n" - "ld1 { v28.s }[2], [x25], #0x4\n" - "ld1 { v27.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" "ld1 { v24.s }[2], [x22], #0x4\n" "ld1 { v23.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[6], [x28]\n" - "ld1 { v30.h }[6], [x27]\n" + "ld1 { v30.h }[6], [x28]\n" + "ld1 { v29.h }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.h }[6], [x26]\n" - "ld1 { v28.h }[6], [x25]\n" - "ld1 { v27.h }[6], [x24]\n" - "ld1 { v26.h }[6], [x23]\n" + "ld1 { v28.h }[6], [x26]\n" + "ld1 { v27.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" "ld1 { v24.h }[6], [x22]\n" "ld1 { 
v23.h }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[4], [x28]\n" - "ld1 { v30.h }[4], [x27]\n" + "ld1 { v30.h }[4], [x28]\n" + "ld1 { v29.h }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.h }[4], [x26]\n" - "ld1 { v28.h }[4], [x25]\n" - "ld1 { v27.h }[4], [x24]\n" - "ld1 { v26.h }[4], [x23]\n" + "ld1 { v28.h }[4], [x26]\n" + "ld1 { v27.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" "ld1 { v24.h }[4], [x22]\n" "ld1 { v23.h }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" - "ldr s31, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" "mov x20, #0x2\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" "ldr s24, [x22], #0x4\n" "ldr s23, [x21], #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.h }[2], [x28]\n" - "ld1 { v30.h }[2], [x27]\n" + "ld1 { v30.h }[2], [x28]\n" + "ld1 { v29.h }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.h }[2], [x26]\n" - "ld1 { v28.h }[2], [x25]\n" - "ld1 { v27.h }[2], [x24]\n" - "ld1 { v26.h }[2], [x23]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" "ld1 { v24.h }[2], [x22]\n" "ld1 { v23.h }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 - "ldr h31, [x28, #0x0]\n" - "ldr h30, [x27, #0x0]\n" + "ldr h30, [x28, #0x0]\n" + "ldr h29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h29, [x26, #0x0]\n" - "ldr h28, [x25, #0x0]\n" - "ldr h27, [x24, #0x0]\n" - "ldr h26, [x23, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" "ldr h24, [x22, #0x0]\n" "ldr h23, [x21, #0x0]\n" "9:" // Odd load end - "zip1 v25.8h, v31.8h, v27.8h\n" - "zip1 v18.8h, v29.8h, v24.8h\n" - "subs x20, x20, #0x1\n" "zip1 v22.8h, v30.8h, v26.8h\n" - "zip1 v21.8h, v28.8h, v23.8h\n" - "zip1 v17.8h, v25.8h, v18.8h\n" - "zip1 v16.8h, v22.8h, v21.8h\n" - "zip1 v20.8h, v17.8h, v16.8h\n" - "str q20, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v20.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" + "subs x20, x20, #0x1\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v17.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "subs x20, x20, #0x1\n" - "str q19, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v19.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v25.8h, v18.8h\n" - "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -266,11 +266,11 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v22.8h, v31.8h, v27.8h\n" - "zip2 v21.8h, v29.8h, v24.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v26.8h\n" - "zip2 v19.8h, v28.8h, v23.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" "zip1 v18.8h, v22.8h, v21.8h\n" "zip1 v17.8h, 
v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" @@ -284,9 +284,9 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v22.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp index 43a3a46801..d29a995b46 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp @@ -80,35 +80,35 @@ void interleave_block<8, 1, VLType::None, false>( "blt 3f\n" "2:" // Main loop head "ldr d25, [x28], #0x8\n" - "ldr d30, [x27], #0x8\n" + "ldr d27, [x27], #0x8\n" "ushll v25.8h, v25.8b, #0x0\n" - "ushll v30.8h, v30.8b, #0x0\n" - "ldr d29, [x26], #0x8\n" - "ldr d28, [x25], #0x8\n" - "ushll v29.8h, v29.8b, #0x0\n" - "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ldr d26, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" "ldr d21, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" + "ldr d20, [x23], #0x8\n" "ushll v21.8h, v21.8b, #0x0\n" - "ushll v27.8h, v27.8b, #0x0\n" - "ldr d20, [x22], #0x8\n" - "ldr d26, [x21], #0x8\n" "ushll v20.8h, v20.8b, #0x0\n" - "ushll v26.8h, v26.8b, #0x0\n" + "ldr d17, [x22], #0x8\n" + "ldr d16, [x21], #0x8\n" + "ushll v17.8h, v17.8b, #0x0\n" + "ushll v16.8h, v16.8b, #0x0\n" "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v22.8h, v29.8h, v20.8h\n" + "zip1 v22.8h, v26.8h, v17.8h\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" - "zip1 v19.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v26.8h\n" + "zip1 v19.8h, v27.8h, v20.8h\n" + "zip1 v18.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v21.8h, v26.8h, v17.8h\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v26.8h\n" + "zip2 v20.8h, v27.8h, v20.8h\n" + "zip2 v16.8h, v24.8h, v16.8h\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" "zip1 v24.8h, v23.8h, v22.8h\n" @@ -142,140 +142,139 @@ void interleave_block<8, 1, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr s25, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s21, [x24], #0x4\n" - "ldr s27, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s26, [x21], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v25.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" "mov x20, #0x6\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v21.h }[2], [x24], #0x2\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "ld1 { v20.h }[2], [x22], #0x2\n" - "ld1 { v26.h }[2], [x21], 
#0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v26.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v23.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v21.b }[6], [x24]\n" - "ld1 { v27.b }[6], [x23]\n" - "ld1 { v20.b }[6], [x22]\n" - "ld1 { v26.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v23.b }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v21.b }[4], [x24]\n" - "ld1 { v27.b }[4], [x23]\n" - "ld1 { v20.b }[4], [x22]\n" - "ld1 { v26.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v23.b }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr h25, [x28], #0x2\n" - "ldr h30, [x27], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" "mov x20, #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h21, [x24], #0x2\n" - "ldr h27, [x23], #0x2\n" - "ldr h20, [x22], #0x2\n" - "ldr h26, [x21], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h26, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h23, [x21], #0x2\n" "tbz %x[width], #0, 7f\n" - "ld1 { v25.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v21.b }[2], [x24]\n" - "ld1 { v27.b }[2], [x23]\n" - "ld1 { v20.b }[2], [x22]\n" - "ld1 { v26.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v23.b }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 - "ldr b25, [x28, #0x0]\n" - "ldr b30, [x27, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b21, [x24, #0x0]\n" - "ldr b27, [x23, #0x0]\n" - "ldr b20, [x22, #0x0]\n" - "ldr b26, [x21, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b26, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b23, [x21, #0x0]\n" "7:" // Odd load end - "ushll v25.8h, v25.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" - "subs x20, x20, #0x1\n" "ushll v29.8h, v29.8b, #0x0\n" + "subs x20, x20, #0x1\n" "ushll v28.8h, v28.8b, #0x0\n" - "ushll v21.8h, v21.8b, #0x0\n" "ushll v27.8h, v27.8b, #0x0\n" - "ushll v20.8h, v20.8b, #0x0\n" "ushll v26.8h, v26.8b, #0x0\n" - "zip1 v23.8h, v25.8h, v21.8h\n" - "zip1 v22.8h, v29.8h, v20.8h\n" - "zip1 v19.8h, v30.8h, v27.8h\n" - "zip1 v18.8h, v28.8h, v26.8h\n" - "zip1 v24.8h, v23.8h, v22.8h\n" - "zip1 v17.8h, v19.8h, v18.8h\n" - "zip1 v16.8h, v24.8h, v17.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, 
v24.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v24.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v23.8h, v23.8h, v22.8h\n" - "zip2 v19.8h, v19.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" - "zip1 v17.8h, v23.8h, v19.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.8h, v23.8h, v19.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v25.8h, v25.8h, v21.8h\n" - "zip2 v21.8h, v29.8h, v20.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v27.8h\n" - "zip2 v16.8h, v28.8h, v26.8h\n" - "zip1 v22.8h, v25.8h, v21.8h\n" - "zip1 v18.8h, v20.8h, v16.8h\n" - "zip1 v19.8h, v22.8h, v18.8h\n" - "str q19, [%x[out_ptr], #0x0]\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v18.8h, v22.8h, v18.8h\n" - "str q18, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 8f\n" - "zip2 v21.8h, v25.8h, v21.8h\n" - "zip2 v20.8h, v20.8h, v16.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" - "str q17, [%x[out_ptr], #0x0]\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp index 3ab24365af..ae4bf9bf3b 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp @@ -167,109 +167,109 @@ void interleave_block<8, 1, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 10f\n" "tbz %x[width], #2, 7f\n" - "ldr s31, [x28], #0x4\n" - "ldr s30, [x27], #0x4\n" - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "ldr s27, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s25, [x22], #0x4\n" - "ldr s24, [x21], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v31.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + 
"ld1 { v30.h }[2], [x28], #0x2\n" + "ld1 { v29.h }[2], [x27], #0x2\n" "mov x20, #0x6\n" - "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x25], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" - "ld1 { v24.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v26.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v23.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x28]\n" + "ld1 { v29.b }[6], [x27]\n" "mov x20, #0x7\n" - "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x25]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" - "ld1 { v24.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v23.b }[6], [x21]\n" "b 9f\n" "6:" // odd_loads_1_4 "mov x20, #0x4\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x28]\n" + "ld1 { v29.b }[4], [x27]\n" "mov x20, #0x5\n" - "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x25]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" - "ld1 { v24.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v23.b }[4], [x21]\n" "b 9f\n" "7:" // odd_loads_2_0 "tbz %x[width], #1, 8f\n" - "ldr h31, [x28], #0x2\n" - "ldr h30, [x27], #0x2\n" + "ldr h30, [x28], #0x2\n" + "ldr h29, [x27], #0x2\n" "mov x20, #0x2\n" - "ldr h29, [x26], #0x2\n" - "ldr h28, [x25], #0x2\n" - "ldr h27, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h25, [x22], #0x2\n" - "ldr h24, [x21], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h26, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h23, [x21], #0x2\n" "tbz %x[width], #0, 9f\n" - "ld1 { v31.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x28]\n" + "ld1 { v29.b }[2], [x27]\n" "mov x20, #0x3\n" - "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x25]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v26.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" - "ld1 { v24.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v23.b }[2], [x21]\n" "b 9f\n" "8:" // odd_loads_1_0 - "ldr b31, [x28, #0x0]\n" - "ldr b30, [x27, #0x0]\n" + "ldr b30, [x28, #0x0]\n" + "ldr b29, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b29, [x26, #0x0]\n" - "ldr b28, [x25, #0x0]\n" - "ldr b27, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b25, [x22, #0x0]\n" - "ldr b24, [x21, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b26, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b23, [x21, #0x0]\n" "9:" // Odd load end - "ushll v31.8h, v31.8b, #0x0\n" "ushll v30.8h, v30.8b, #0x0\n" - "subs x20, x20, #0x1\n" "ushll v29.8h, v29.8b, #0x0\n" + "subs x20, x20, #0x1\n" "ushll v28.8h, v28.8b, #0x0\n" "ushll v27.8h, v27.8b, #0x0\n" "ushll v26.8h, v26.8b, #0x0\n" "ushll v25.8h, v25.8b, #0x0\n" "ushll v24.8h, v24.8b, #0x0\n" - "zip1 v23.8h, v31.8h, v27.8h\n" - "zip1 v22.8h, v29.8h, v25.8h\n" - "zip1 v21.8h, v30.8h, v26.8h\n" 
- "zip1 v20.8h, v28.8h, v24.8h\n" - "zip1 v18.8h, v23.8h, v22.8h\n" - "zip1 v17.8h, v21.8h, v20.8h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v22.8h, v30.8h, v26.8h\n" + "zip1 v21.8h, v28.8h, v24.8h\n" + "zip1 v20.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v23.8h\n" + "zip1 v18.8h, v22.8h, v21.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v19.8h, v18.8h, v17.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" "subs x20, x20, #0x1\n" - "str q19, [%x[out_ptr], #0x0]\n" - "add v2.8h, v2.8h, v19.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v23.8h, v22.8h\n" - "zip2 v17.8h, v21.8h, v20.8h\n" + "zip2 v18.8h, v22.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" "subs x20, x20, #0x1\n" "zip1 v16.8h, v18.8h, v17.8h\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -282,11 +282,11 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v22.8h, v31.8h, v27.8h\n" - "zip2 v21.8h, v29.8h, v25.8h\n" + "zip2 v22.8h, v30.8h, v26.8h\n" + "zip2 v21.8h, v28.8h, v24.8h\n" "subs x20, x20, #0x1\n" - "zip2 v20.8h, v30.8h, v26.8h\n" - "zip2 v19.8h, v28.8h, v24.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v23.8h\n" "zip1 v18.8h, v22.8h, v21.8h\n" "zip1 v17.8h, v20.8h, v19.8h\n" "zip1 v16.8h, v18.8h, v17.8h\n" @@ -300,9 +300,9 @@ void interleave_block<8, 1, VLType::None, true>( "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" "beq 10f\n" - "zip2 v18.8h, v22.8h, v21.8h\n" - "zip2 v17.8h, v20.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v17.8h\n" + "zip2 v17.8h, v22.8h, v21.8h\n" + "zip2 v16.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" "str q16, [%x[out_ptr], #0x0]\n" "add v2.8h, v2.8h, v16.8h\n" "add %x[out_ptr], %x[out_ptr], #0x10\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp index d4d150456f..43d9d20c10 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp @@ -79,29 +79,29 @@ void interleave_block<8, 2, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x28], #0x10\n" - "ldr q27, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q18, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" - "ldr q22, [x26], #0x10\n" - "ldr q21, [x25], #0x10\n" - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" - "ldr q24, [x24], #0x10\n" + "ldr q17, [x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v25.4s, v20.4s, v17.4s\n" + "zip1 v24.4s, v18.4s, v16.4s\n" + "ldr q19, [x24], #0x10\n" "ldr q23, [x23], #0x10\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "ldr q19, [x22], #0x10\n" - "ldr q18, [x21], #0x10\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" + "zip2 v22.4s, v20.4s, v17.4s\n" + "zip2 v21.4s, v18.4s, v16.4s\n" + "ldr q18, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v20.4s, v19.4s, v18.4s\n" + "zip1 v17.4s, v23.4s, v16.4s\n" + "zip2 v19.4s, v19.4s, v18.4s\n" + "zip2 v18.4s, v23.4s, 
v16.4s\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" @@ -109,7 +109,7 @@ void interleave_block<8, 2, VLType::None, false>( "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x20]\n" "zip2 v16.4s, v20.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x30]\n" @@ -128,32 +128,32 @@ void interleave_block<8, 2, VLType::None, false>( "tbz %x[width], #2, 5f\n" "ldr d28, [x28], #0x8\n" "ldr d27, [x27], #0x8\n" - "ldr d22, [x26], #0x8\n" - "ldr d21, [x25], #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" "ldr d24, [x24], #0x8\n" "ldr d23, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" "ld1 { v28.s }[2], [x28], #0x4\n" "ld1 { v27.s }[2], [x27], #0x4\n" "mov x20, #0x3\n" - "ld1 { v22.s }[2], [x26], #0x4\n" - "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v24.s }[2], [x24], #0x4\n" "ld1 { v23.s }[2], [x23], #0x4\n" - "ld1 { v19.s }[2], [x22], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "ld1 { v21.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v28.h }[6], [x28]\n" "ld1 { v27.h }[6], [x27]\n" "mov x20, #0x4\n" - "ld1 { v22.h }[6], [x26]\n" - "ld1 { v21.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" "ld1 { v24.h }[6], [x24]\n" "ld1 { v23.h }[6], [x23]\n" - "ld1 { v19.h }[6], [x22]\n" - "ld1 { v18.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x22]\n" + "ld1 { v21.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x2\n" @@ -161,82 +161,81 @@ void interleave_block<8, 2, VLType::None, false>( "ld1 { v28.h }[4], [x28]\n" "ld1 { v27.h }[4], [x27]\n" "mov x20, #0x3\n" - "ld1 { v22.h }[4], [x26]\n" - "ld1 { v21.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" "ld1 { v24.h }[4], [x24]\n" "ld1 { v23.h }[4], [x23]\n" - "ld1 { v19.h }[4], [x22]\n" - "ld1 { v18.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x22]\n" + "ld1 { v21.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" "ldr s28, [x28], #0x4\n" "ldr s27, [x27], #0x4\n" "mov x20, #0x1\n" - "ldr s22, [x26], #0x4\n" - "ldr s21, [x25], #0x4\n" + "ldr s26, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" "ldr s24, [x24], #0x4\n" "ldr s23, [x23], #0x4\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" "ld1 { v28.h }[2], [x28]\n" "ld1 { v27.h }[2], [x27]\n" "mov x20, #0x2\n" - "ld1 { v22.h }[2], [x26]\n" - "ld1 { v21.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" "ld1 { v24.h }[2], [x24]\n" "ld1 { v23.h }[2], [x23]\n" - "ld1 { v19.h }[2], [x22]\n" - "ld1 { v18.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x22]\n" + "ld1 { v21.h }[2], [x21]\n" "b 7f\n" "6:" // odd_loads_1_0 "ldr h28, [x28, #0x0]\n" "ldr h27, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h22, [x26, #0x0]\n" - "ldr h21, [x25, #0x0]\n" + "ldr h26, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" "ldr h24, [x24, #0x0]\n" "ldr h23, [x23, #0x0]\n" - "ldr h19, [x22, #0x0]\n" - "ldr h18, [x21, #0x0]\n" + "ldr h22, [x22, #0x0]\n" + "ldr h21, 
[x21, #0x0]\n" "7:" // Odd load end - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" "subs x20, x20, #0x1\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v20.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" "subs x20, x20, #0x1\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v20.4s, v17.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "zip2 v19.4s, v27.4s, v25.4s\n" "subs x20, x20, #0x1\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v22.4s, v21.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 8f\n" - "zip2 v17.4s, v22.4s, v21.4s\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp index 358b83ad1b..3ec03370a0 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp @@ -79,18 +79,18 @@ void interleave_block<8, 2, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q26, [x28], #0x10\n" - "ldr q21, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q19, [x27], #0x10\n" "subs %x[width], %x[width], #0x4\n" "cmp %x[width], #0x4\n" "ldr q25, [x26], #0x10\n" "ldr q24, [x25], #0x10\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" "zip1 v18.2d, v25.2d, v24.2d\n" "ldr q23, [x24], #0x10\n" "ldr q22, [x23], #0x10\n" "zip1 v17.2d, v23.2d, v22.2d\n" - "zip2 v21.2d, v26.2d, v21.2d\n" + "zip2 v21.2d, v20.2d, v19.2d\n" "ldr q20, [x22], #0x10\n" "ldr q19, [x21], #0x10\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -118,62 +118,61 @@ void interleave_block<8, 2, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr d26, [x28], #0x8\n" - "ldr d21, [x27], #0x8\n" + "ldr d25, [x28], #0x8\n" + "ldr d24, [x27], #0x8\n" "mov x20, #0x1\n" - "ldr d25, [x26], #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" 
+ "ldr d23, [x26], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #0, 5f\n" - "ld1 { v26.s }[2], [x28]\n" - "ld1 { v21.s }[2], [x27]\n" + "ld1 { v25.s }[2], [x28]\n" + "ld1 { v24.s }[2], [x27]\n" "mov x20, #0x2\n" - "ld1 { v25.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v23.s }[2], [x24]\n" - "ld1 { v22.s }[2], [x23]\n" - "ld1 { v20.s }[2], [x22]\n" - "ld1 { v19.s }[2], [x21]\n" + "ld1 { v23.s }[2], [x26]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v20.s }[2], [x23]\n" + "ld1 { v19.s }[2], [x22]\n" + "ld1 { v18.s }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr s26, [x28, #0x0]\n" - "ldr s21, [x27, #0x0]\n" + "ldr s25, [x28, #0x0]\n" + "ldr s24, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr s25, [x26, #0x0]\n" - "ldr s24, [x25, #0x0]\n" - "ldr s23, [x24, #0x0]\n" - "ldr s22, [x23, #0x0]\n" - "ldr s20, [x22, #0x0]\n" - "ldr s19, [x21, #0x0]\n" + "ldr s23, [x26, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s20, [x23, #0x0]\n" + "ldr s19, [x22, #0x0]\n" + "ldr s18, [x21, #0x0]\n" "5:" // Odd load end "subs x20, x20, #0x1\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v25.2d, v24.2d\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v23.2d, v22.2d\n" - "zip1 v16.2d, v20.2d, v19.2d\n" + "zip1 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v21.2d, v20.2d\n" + "zip1 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 6f\n" - "zip2 v21.2d, v26.2d, v21.2d\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip2 v17.2d, v23.2d, v22.2d\n" - "zip2 v16.2d, v20.2d, v19.2d\n" + "zip2 v16.2d, v25.2d, v24.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v21.2d, v20.2d\n" + "zip2 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "6:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp index d606d5a5b6..e9799f87a9 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp @@ -79,18 +79,18 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q26, [x28], #0x10\n" - "ldr q21, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q19, [x27], #0x10\n" "subs %x[width], %x[width], #0x8\n" "cmp %x[width], #0x8\n" "ldr q25, [x26], #0x10\n" "ldr q24, [x25], #0x10\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" "zip1 v18.2d, 
v25.2d, v24.2d\n" "ldr q23, [x24], #0x10\n" "ldr q22, [x23], #0x10\n" "zip1 v17.2d, v23.2d, v22.2d\n" - "zip2 v21.2d, v26.2d, v21.2d\n" + "zip2 v21.2d, v20.2d, v19.2d\n" "ldr q20, [x22], #0x10\n" "ldr q19, [x21], #0x10\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -118,104 +118,103 @@ void interleave_block<8, 4, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 8f\n" "tbz %x[width], #2, 5f\n" - "ldr d26, [x28], #0x8\n" - "ldr d21, [x27], #0x8\n" - "ldr d25, [x26], #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" + "ldr d25, [x28], #0x8\n" + "ldr d24, [x27], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #1, 4f\n" - "ld1 { v26.s }[2], [x28], #0x4\n" - "ld1 { v21.s }[2], [x27], #0x4\n" + "ld1 { v25.s }[2], [x28], #0x4\n" + "ld1 { v24.s }[2], [x27], #0x4\n" "mov x20, #0x2\n" - "ld1 { v25.s }[2], [x26], #0x4\n" - "ld1 { v24.s }[2], [x25], #0x4\n" - "ld1 { v23.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" - "ld1 { v20.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v26.h }[6], [x28]\n" - "ld1 { v21.h }[6], [x27]\n" - "ld1 { v25.h }[6], [x26]\n" - "ld1 { v24.h }[6], [x25]\n" - "ld1 { v23.h }[6], [x24]\n" - "ld1 { v22.h }[6], [x23]\n" - "ld1 { v20.h }[6], [x22]\n" - "ld1 { v19.h }[6], [x21]\n" + "ld1 { v25.h }[6], [x28]\n" + "ld1 { v24.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x26]\n" + "ld1 { v22.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "ld1 { v19.h }[6], [x22]\n" + "ld1 { v18.h }[6], [x21]\n" "b 7f\n" "4:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 7f\n" - "ld1 { v26.h }[4], [x28]\n" - "ld1 { v21.h }[4], [x27]\n" + "ld1 { v25.h }[4], [x28]\n" + "ld1 { v24.h }[4], [x27]\n" "mov x20, #0x2\n" - "ld1 { v25.h }[4], [x26]\n" - "ld1 { v24.h }[4], [x25]\n" - "ld1 { v23.h }[4], [x24]\n" - "ld1 { v22.h }[4], [x23]\n" - "ld1 { v20.h }[4], [x22]\n" - "ld1 { v19.h }[4], [x21]\n" + "ld1 { v23.h }[4], [x26]\n" + "ld1 { v22.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "ld1 { v19.h }[4], [x22]\n" + "ld1 { v18.h }[4], [x21]\n" "b 7f\n" "5:" // odd_loads_2_0 "tbz %x[width], #1, 6f\n" - "ldr s26, [x28], #0x4\n" - "ldr s21, [x27], #0x4\n" + "ldr s25, [x28], #0x4\n" + "ldr s24, [x27], #0x4\n" "mov x20, #0x1\n" - "ldr s25, [x26], #0x4\n" - "ldr s24, [x25], #0x4\n" - "ldr s23, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s23, [x26], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #0, 7f\n" - "ld1 { v26.h }[2], [x28]\n" - "ld1 { v21.h }[2], [x27]\n" - "ld1 { v25.h }[2], [x26]\n" - "ld1 { v24.h }[2], [x25]\n" - "ld1 { v23.h }[2], [x24]\n" - "ld1 { v22.h }[2], [x23]\n" - "ld1 { v20.h }[2], [x22]\n" - "ld1 { v19.h }[2], [x21]\n" + "ld1 { v25.h }[2], [x28]\n" + "ld1 { v24.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x26]\n" + "ld1 { v22.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "ld1 { v19.h }[2], [x22]\n" + "ld1 { v18.h }[2], [x21]\n" "b 7f\n" 
"6:" // odd_loads_1_0 - "ldr h26, [x28, #0x0]\n" - "ldr h21, [x27, #0x0]\n" + "ldr h25, [x28, #0x0]\n" + "ldr h24, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr h25, [x26, #0x0]\n" - "ldr h24, [x25, #0x0]\n" - "ldr h23, [x24, #0x0]\n" - "ldr h22, [x23, #0x0]\n" - "ldr h20, [x22, #0x0]\n" - "ldr h19, [x21, #0x0]\n" + "ldr h23, [x26, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "ldr h19, [x22, #0x0]\n" + "ldr h18, [x21, #0x0]\n" "7:" // Odd load end "subs x20, x20, #0x1\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v25.2d, v24.2d\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v23.2d, v22.2d\n" - "zip1 v16.2d, v20.2d, v19.2d\n" + "zip1 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v21.2d, v20.2d\n" + "zip1 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 8f\n" - "zip2 v21.2d, v26.2d, v21.2d\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip2 v17.2d, v23.2d, v22.2d\n" - "zip2 v16.2d, v20.2d, v19.2d\n" + "zip2 v16.2d, v25.2d, v24.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v21.2d, v20.2d\n" + "zip2 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "8:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp index dfec14358b..730bfd6342 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp @@ -79,14 +79,14 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q23, [x28], #0x10\n" - "ldr q22, [x26], #0x10\n" - ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" - ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" - "ldr q21, [x24], #0x10\n" - "ldr q20, [x22], #0x10\n" - ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" - ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" + "ldr q17, [x28], #0x10\n" + "ldr q16, [x26], #0x10\n" + ".inst 0x0ea16a37 // bfcvtn v23.4h, v17.4s\n" + ".inst 0x0ea16a16 // bfcvtn v22.4h, v16.4s\n" + "ldr q17, [x24], #0x10\n" + "ldr q16, [x22], #0x10\n" + ".inst 0x0ea16a35 // bfcvtn v21.4h, v17.4s\n" + ".inst 0x0ea16a14 // bfcvtn v20.4h, v16.4s\n" "ldr q19, [x27], #0x10\n" "ldr q18, [x25], #0x10\n" "subs %x[width], %x[width], #0x4\n" @@ -114,51 +114,50 @@ void interleave_block<8, 4, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 6f\n" "tbz %x[width], #1, 4f\n" - "ldr d23, [x28], #0x8\n" - "ldr d19, [x27], #0x8\n" + "ldr d19, [x28], #0x8\n" + "ldr d23, [x27], #0x8\n" "mov x20, #0x1\n" - "ldr d22, [x26], #0x8\n" - "ldr d18, 
[x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d17, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d16, [x21], #0x8\n" + "ldr d18, [x26], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d21, [x23], #0x8\n" + "ldr d16, [x22], #0x8\n" + "ldr d20, [x21], #0x8\n" "tbz %x[width], #0, 5f\n" - "ld1 { v23.s }[2], [x28]\n" - "ld1 { v19.s }[2], [x27]\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v18.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "ld1 { v17.s }[2], [x23]\n" - "ld1 { v20.s }[2], [x22]\n" - "ld1 { v16.s }[2], [x21]\n" + "ld1 { v19.s }[2], [x28]\n" + "ld1 { v23.s }[2], [x27]\n" + "ld1 { v18.s }[2], [x26]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v17.s }[2], [x24]\n" + "ld1 { v21.s }[2], [x23]\n" + "ld1 { v16.s }[2], [x22]\n" + "ld1 { v20.s }[2], [x21]\n" "b 5f\n" "4:" // odd_loads_1_0 - "ldr s23, [x28, #0x0]\n" - "ldr s19, [x27, #0x0]\n" + "ldr s19, [x28, #0x0]\n" + "ldr s23, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr s22, [x26, #0x0]\n" - "ldr s18, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "ldr s17, [x23, #0x0]\n" - "ldr s20, [x22, #0x0]\n" - "ldr s16, [x21, #0x0]\n" + "ldr s18, [x26, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s17, [x24, #0x0]\n" + "ldr s21, [x23, #0x0]\n" + "ldr s16, [x22, #0x0]\n" + "ldr s20, [x21, #0x0]\n" "5:" // Odd load end - ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" - ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" - ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" - ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" - ".inst 0x4ea16a77 // bfcvtn2 v23.8h, v19.4s\n" - ".inst 0x4ea16a56 // bfcvtn2 v22.8h, v18.4s\n" - "str q23, [%x[out_ptr], #0x0]\n" - ".inst 0x4ea16a35 // bfcvtn2 v21.8h, v17.4s\n" - ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n" - "str q22, [%x[out_ptr], #0x10]\n" - "str q21, [%x[out_ptr], #0x20]\n" - "str q20, [%x[out_ptr], #0x30]\n" + ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" + ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" + ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16af3 // bfcvtn2 v19.8h, v23.4s\n" + ".inst 0x4ea16ad2 // bfcvtn2 v18.8h, v22.4s\n" + "str q19, [%x[out_ptr], #0x0]\n" + ".inst 0x4ea16ab1 // bfcvtn2 v17.8h, v21.4s\n" + ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "6:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp index 54f15f8a5c..15d8ddbe53 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp @@ -79,29 +79,29 @@ void interleave_block<8, 4, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q28, [x28], #0x10\n" - "ldr q27, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q18, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" "cmp %x[width], #0x10\n" - "ldr q22, [x26], #0x10\n" - "ldr q21, [x25], #0x10\n" - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" - "ldr q24, [x24], #0x10\n" + "ldr q17, 
[x26], #0x10\n" + "ldr q16, [x25], #0x10\n" + "zip1 v25.4s, v20.4s, v17.4s\n" + "zip1 v24.4s, v18.4s, v16.4s\n" + "ldr q19, [x24], #0x10\n" "ldr q23, [x23], #0x10\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" - "ldr q19, [x22], #0x10\n" - "ldr q18, [x21], #0x10\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" + "zip2 v22.4s, v20.4s, v17.4s\n" + "zip2 v21.4s, v18.4s, v16.4s\n" + "ldr q18, [x22], #0x10\n" + "ldr q16, [x21], #0x10\n" + "zip1 v20.4s, v19.4s, v18.4s\n" + "zip1 v17.4s, v23.4s, v16.4s\n" + "zip2 v19.4s, v19.4s, v18.4s\n" + "zip2 v18.4s, v23.4s, v16.4s\n" "prfm pldl1keep, [x28, #0x70]\n" "prfm pldl1keep, [x27, #0x70]\n" "prfm pldl1keep, [x26, #0x70]\n" "prfm pldl1keep, [x25, #0x70]\n" - "zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x0]\n" "prfm pldl1keep, [x24, #0x70]\n" "prfm pldl1keep, [x23, #0x70]\n" @@ -109,7 +109,7 @@ void interleave_block<8, 4, VLType::None, false>( "str q16, [%x[out_ptr], #0x10]\n" "prfm pldl1keep, [x22, #0x70]\n" "prfm pldl1keep, [x21, #0x70]\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" "str q16, [%x[out_ptr], #0x20]\n" "zip2 v16.4s, v20.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x30]\n" @@ -128,40 +128,40 @@ void interleave_block<8, 4, VLType::None, false>( "tbz %x[width], #3, 7f\n" "ldr d28, [x28], #0x8\n" "ldr d27, [x27], #0x8\n" - "ldr d22, [x26], #0x8\n" - "ldr d21, [x25], #0x8\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" "ldr d24, [x24], #0x8\n" "ldr d23, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" + "ldr d22, [x22], #0x8\n" + "ldr d21, [x21], #0x8\n" "tbz %x[width], #2, 5f\n" "ld1 { v28.s }[2], [x28], #0x4\n" "ld1 { v27.s }[2], [x27], #0x4\n" - "ld1 { v22.s }[2], [x26], #0x4\n" - "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" "ld1 { v24.s }[2], [x24], #0x4\n" "ld1 { v23.s }[2], [x23], #0x4\n" - "ld1 { v19.s }[2], [x22], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x22], #0x4\n" + "ld1 { v21.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 4f\n" "ld1 { v28.h }[6], [x28], #0x2\n" "ld1 { v27.h }[6], [x27], #0x2\n" "mov x20, #0x4\n" - "ld1 { v22.h }[6], [x26], #0x2\n" - "ld1 { v21.h }[6], [x25], #0x2\n" + "ld1 { v26.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" "ld1 { v24.h }[6], [x24], #0x2\n" "ld1 { v23.h }[6], [x23], #0x2\n" - "ld1 { v19.h }[6], [x22], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v22.h }[6], [x22], #0x2\n" + "ld1 { v21.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[14], [x28]\n" "ld1 { v27.b }[14], [x27]\n" - "ld1 { v22.b }[14], [x26]\n" - "ld1 { v21.b }[14], [x25]\n" + "ld1 { v26.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" "ld1 { v24.b }[14], [x24]\n" "ld1 { v23.b }[14], [x23]\n" - "ld1 { v19.b }[14], [x22]\n" - "ld1 { v18.b }[14], [x21]\n" + "ld1 { v22.b }[14], [x22]\n" + "ld1 { v21.b }[14], [x21]\n" "b 11f\n" "4:" // odd_loads_1_12 "mov x20, #0x3\n" @@ -169,33 +169,33 @@ void interleave_block<8, 4, VLType::None, false>( "ld1 { v28.b }[12], [x28]\n" "ld1 { v27.b }[12], [x27]\n" "mov x20, #0x4\n" - "ld1 { v22.b }[12], [x26]\n" - "ld1 { v21.b }[12], [x25]\n" + "ld1 { v26.b }[12], [x26]\n" + "ld1 { v25.b }[12], [x25]\n" "ld1 { v24.b }[12], [x24]\n" "ld1 { v23.b }[12], [x23]\n" - "ld1 { v19.b }[12], [x22]\n" - "ld1 { v18.b }[12], [x21]\n" + "ld1 { v22.b }[12], [x22]\n" + "ld1 { v21.b }[12], [x21]\n" "b 
11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" "ld1 { v28.h }[4], [x28], #0x2\n" "ld1 { v27.h }[4], [x27], #0x2\n" "mov x20, #0x3\n" - "ld1 { v22.h }[4], [x26], #0x2\n" - "ld1 { v21.h }[4], [x25], #0x2\n" + "ld1 { v26.h }[4], [x26], #0x2\n" + "ld1 { v25.h }[4], [x25], #0x2\n" "ld1 { v24.h }[4], [x24], #0x2\n" "ld1 { v23.h }[4], [x23], #0x2\n" - "ld1 { v19.h }[4], [x22], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v22.h }[4], [x22], #0x2\n" + "ld1 { v21.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[10], [x28]\n" "ld1 { v27.b }[10], [x27]\n" - "ld1 { v22.b }[10], [x26]\n" - "ld1 { v21.b }[10], [x25]\n" + "ld1 { v26.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" "ld1 { v24.b }[10], [x24]\n" "ld1 { v23.b }[10], [x23]\n" - "ld1 { v19.b }[10], [x22]\n" - "ld1 { v18.b }[10], [x21]\n" + "ld1 { v22.b }[10], [x22]\n" + "ld1 { v21.b }[10], [x21]\n" "b 11f\n" "6:" // odd_loads_1_8 "mov x20, #0x2\n" @@ -203,42 +203,42 @@ void interleave_block<8, 4, VLType::None, false>( "ld1 { v28.b }[8], [x28]\n" "ld1 { v27.b }[8], [x27]\n" "mov x20, #0x3\n" - "ld1 { v22.b }[8], [x26]\n" - "ld1 { v21.b }[8], [x25]\n" + "ld1 { v26.b }[8], [x26]\n" + "ld1 { v25.b }[8], [x25]\n" "ld1 { v24.b }[8], [x24]\n" "ld1 { v23.b }[8], [x23]\n" - "ld1 { v19.b }[8], [x22]\n" - "ld1 { v18.b }[8], [x21]\n" + "ld1 { v22.b }[8], [x22]\n" + "ld1 { v21.b }[8], [x21]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" "ldr s28, [x28], #0x4\n" "ldr s27, [x27], #0x4\n" - "ldr s22, [x26], #0x4\n" - "ldr s21, [x25], #0x4\n" + "ldr s26, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" "ldr s24, [x24], #0x4\n" "ldr s23, [x23], #0x4\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" + "ldr s22, [x22], #0x4\n" + "ldr s21, [x21], #0x4\n" "tbz %x[width], #1, 8f\n" "ld1 { v28.h }[2], [x28], #0x2\n" "ld1 { v27.h }[2], [x27], #0x2\n" "mov x20, #0x2\n" - "ld1 { v22.h }[2], [x26], #0x2\n" - "ld1 { v21.h }[2], [x25], #0x2\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" "ld1 { v24.h }[2], [x24], #0x2\n" "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v19.h }[2], [x22], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v22.h }[2], [x22], #0x2\n" + "ld1 { v21.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[6], [x28]\n" "ld1 { v27.b }[6], [x27]\n" - "ld1 { v22.b }[6], [x26]\n" - "ld1 { v21.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" "ld1 { v24.b }[6], [x24]\n" "ld1 { v23.b }[6], [x23]\n" - "ld1 { v19.b }[6], [x22]\n" - "ld1 { v18.b }[6], [x21]\n" + "ld1 { v22.b }[6], [x22]\n" + "ld1 { v21.b }[6], [x21]\n" "b 11f\n" "8:" // odd_loads_1_4 "mov x20, #0x1\n" @@ -246,81 +246,80 @@ void interleave_block<8, 4, VLType::None, false>( "ld1 { v28.b }[4], [x28]\n" "ld1 { v27.b }[4], [x27]\n" "mov x20, #0x2\n" - "ld1 { v22.b }[4], [x26]\n" - "ld1 { v21.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x26]\n" + "ld1 { v25.b }[4], [x25]\n" "ld1 { v24.b }[4], [x24]\n" "ld1 { v23.b }[4], [x23]\n" - "ld1 { v19.b }[4], [x22]\n" - "ld1 { v18.b }[4], [x21]\n" + "ld1 { v22.b }[4], [x22]\n" + "ld1 { v21.b }[4], [x21]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" "ldr h28, [x28], #0x2\n" "ldr h27, [x27], #0x2\n" "mov x20, #0x1\n" - "ldr h22, [x26], #0x2\n" - "ldr h21, [x25], #0x2\n" + "ldr h26, [x26], #0x2\n" + "ldr h25, [x25], #0x2\n" "ldr h24, [x24], #0x2\n" "ldr h23, [x23], #0x2\n" - "ldr h19, [x22], #0x2\n" - "ldr h18, [x21], #0x2\n" + "ldr h22, [x22], #0x2\n" + "ldr h21, [x21], #0x2\n" "tbz %x[width], #0, 11f\n" "ld1 { v28.b }[2], [x28]\n" "ld1 { v27.b 
}[2], [x27]\n" - "ld1 { v22.b }[2], [x26]\n" - "ld1 { v21.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" "ld1 { v24.b }[2], [x24]\n" "ld1 { v23.b }[2], [x23]\n" - "ld1 { v19.b }[2], [x22]\n" - "ld1 { v18.b }[2], [x21]\n" + "ld1 { v22.b }[2], [x22]\n" + "ld1 { v21.b }[2], [x21]\n" "b 11f\n" "10:" // odd_loads_1_0 "ldr b28, [x28, #0x0]\n" "ldr b27, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b22, [x26, #0x0]\n" - "ldr b21, [x25, #0x0]\n" + "ldr b26, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" "ldr b24, [x24, #0x0]\n" "ldr b23, [x23, #0x0]\n" - "ldr b19, [x22, #0x0]\n" - "ldr b18, [x21, #0x0]\n" + "ldr b22, [x22, #0x0]\n" + "ldr b21, [x21, #0x0]\n" "11:" // Odd load end - "zip1 v26.4s, v28.4s, v22.4s\n" - "zip1 v25.4s, v27.4s, v21.4s\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" "subs x20, x20, #0x1\n" - "zip1 v20.4s, v24.4s, v19.4s\n" - "zip1 v17.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v26.4s, v25.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v20.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" "subs x20, x20, #0x1\n" - "zip2 v16.4s, v26.4s, v25.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v20.4s, v17.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v22.4s, v28.4s, v22.4s\n" - "zip2 v21.4s, v27.4s, v21.4s\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "zip2 v19.4s, v27.4s, v25.4s\n" "subs x20, x20, #0x1\n" - "zip2 v19.4s, v24.4s, v19.4s\n" - "zip2 v18.4s, v23.4s, v18.4s\n" - "zip1 v16.4s, v22.4s, v21.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v16.4s, v19.4s, v18.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 12f\n" - "zip2 v17.4s, v22.4s, v21.4s\n" - "str q17, [%x[out_ptr], #0x0]\n" - "zip2 v16.4s, v19.4s, v18.4s\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "12:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp index 2db54126c0..6c41b5fdfb 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp @@ -153,202 +153,202 @@ void interleave_block<8, 4, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d30, [x28], #0x8\n" - "ldr d29, [x27], #0x8\n" - "ldr d28, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" + "ldr d29, [x28], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + 
"ldr d26, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v30.s }[2], [x28], #0x4\n" - "ld1 { v29.s }[2], [x27], #0x4\n" - "ld1 { v28.s }[2], [x26], #0x4\n" - "ld1 { v27.s }[2], [x25], #0x4\n" - "ld1 { v20.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" - "ld1 { v19.s }[2], [x22], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v29.s }[2], [x28], #0x4\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v27.s }[2], [x26], #0x4\n" + "ld1 { v26.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x22], #0x4\n" + "ld1 { v22.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v30.h }[6], [x28], #0x2\n" - "ld1 { v29.h }[6], [x27], #0x2\n" + "ld1 { v29.h }[6], [x28], #0x2\n" + "ld1 { v28.h }[6], [x27], #0x2\n" "mov x20, #0x4\n" - "ld1 { v28.h }[6], [x26], #0x2\n" - "ld1 { v27.h }[6], [x25], #0x2\n" - "ld1 { v20.h }[6], [x24], #0x2\n" - "ld1 { v26.h }[6], [x23], #0x2\n" - "ld1 { v19.h }[6], [x22], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v27.h }[6], [x26], #0x2\n" + "ld1 { v26.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v24.h }[6], [x23], #0x2\n" + "ld1 { v23.h }[6], [x22], #0x2\n" + "ld1 { v22.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[14], [x28]\n" - "ld1 { v29.b }[14], [x27]\n" - "ld1 { v28.b }[14], [x26]\n" - "ld1 { v27.b }[14], [x25]\n" - "ld1 { v20.b }[14], [x24]\n" - "ld1 { v26.b }[14], [x23]\n" - "ld1 { v19.b }[14], [x22]\n" - "ld1 { v18.b }[14], [x21]\n" + "ld1 { v29.b }[14], [x28]\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v27.b }[14], [x26]\n" + "ld1 { v26.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v24.b }[14], [x23]\n" + "ld1 { v23.b }[14], [x22]\n" + "ld1 { v22.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x20, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[12], [x28]\n" - "ld1 { v29.b }[12], [x27]\n" + "ld1 { v29.b }[12], [x28]\n" + "ld1 { v28.b }[12], [x27]\n" "mov x20, #0x4\n" - "ld1 { v28.b }[12], [x26]\n" - "ld1 { v27.b }[12], [x25]\n" - "ld1 { v20.b }[12], [x24]\n" - "ld1 { v26.b }[12], [x23]\n" - "ld1 { v19.b }[12], [x22]\n" - "ld1 { v18.b }[12], [x21]\n" + "ld1 { v27.b }[12], [x26]\n" + "ld1 { v26.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v24.b }[12], [x23]\n" + "ld1 { v23.b }[12], [x22]\n" + "ld1 { v22.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v30.h }[4], [x28], #0x2\n" - "ld1 { v29.h }[4], [x27], #0x2\n" + "ld1 { v29.h }[4], [x28], #0x2\n" + "ld1 { v28.h }[4], [x27], #0x2\n" "mov x20, #0x3\n" - "ld1 { v28.h }[4], [x26], #0x2\n" - "ld1 { v27.h }[4], [x25], #0x2\n" - "ld1 { v20.h }[4], [x24], #0x2\n" - "ld1 { v26.h }[4], [x23], #0x2\n" - "ld1 { v19.h }[4], [x22], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v27.h }[4], [x26], #0x2\n" + "ld1 { v26.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v24.h }[4], [x23], #0x2\n" + "ld1 { v23.h }[4], [x22], #0x2\n" + "ld1 { v22.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[10], [x28]\n" - "ld1 { v29.b }[10], [x27]\n" - "ld1 { v28.b }[10], [x26]\n" - "ld1 { v27.b }[10], [x25]\n" - "ld1 { v20.b }[10], [x24]\n" - "ld1 { v26.b }[10], [x23]\n" - "ld1 { v19.b }[10], [x22]\n" - "ld1 { v18.b }[10], [x21]\n" + "ld1 { v29.b }[10], [x28]\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v27.b }[10], [x26]\n" + "ld1 { v26.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + 
"ld1 { v24.b }[10], [x23]\n" + "ld1 { v23.b }[10], [x22]\n" + "ld1 { v22.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[8], [x28]\n" - "ld1 { v29.b }[8], [x27]\n" + "ld1 { v29.b }[8], [x28]\n" + "ld1 { v28.b }[8], [x27]\n" "mov x20, #0x3\n" - "ld1 { v28.b }[8], [x26]\n" - "ld1 { v27.b }[8], [x25]\n" - "ld1 { v20.b }[8], [x24]\n" - "ld1 { v26.b }[8], [x23]\n" - "ld1 { v19.b }[8], [x22]\n" - "ld1 { v18.b }[8], [x21]\n" + "ld1 { v27.b }[8], [x26]\n" + "ld1 { v26.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v24.b }[8], [x23]\n" + "ld1 { v23.b }[8], [x22]\n" + "ld1 { v22.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s30, [x28], #0x4\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "ldr s27, [x25], #0x4\n" - "ldr s20, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" + "ldr s29, [x28], #0x4\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "ld1 { v29.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x28], #0x2\n" + "ld1 { v28.h }[2], [x27], #0x2\n" "mov x20, #0x2\n" - "ld1 { v28.h }[2], [x26], #0x2\n" - "ld1 { v27.h }[2], [x25], #0x2\n" - "ld1 { v20.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v19.h }[2], [x22], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "ld1 { v26.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v22.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[6], [x28]\n" - "ld1 { v29.b }[6], [x27]\n" - "ld1 { v28.b }[6], [x26]\n" - "ld1 { v27.b }[6], [x25]\n" - "ld1 { v20.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v19.b }[6], [x22]\n" - "ld1 { v18.b }[6], [x21]\n" + "ld1 { v29.b }[6], [x28]\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v27.b }[6], [x26]\n" + "ld1 { v26.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v24.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v22.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[4], [x28]\n" - "ld1 { v29.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x28]\n" + "ld1 { v28.b }[4], [x27]\n" "mov x20, #0x2\n" - "ld1 { v28.b }[4], [x26]\n" - "ld1 { v27.b }[4], [x25]\n" - "ld1 { v20.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v19.b }[4], [x22]\n" - "ld1 { v18.b }[4], [x21]\n" + "ld1 { v27.b }[4], [x26]\n" + "ld1 { v26.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v24.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v22.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h30, [x28], #0x2\n" - "ldr h29, [x27], #0x2\n" + "ldr h29, [x28], #0x2\n" + "ldr h28, [x27], #0x2\n" "mov x20, #0x1\n" - "ldr h28, [x26], #0x2\n" - "ldr h27, [x25], #0x2\n" - "ldr h20, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h19, [x22], #0x2\n" - "ldr h18, [x21], #0x2\n" + "ldr h27, [x26], #0x2\n" + "ldr h26, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h24, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h22, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[2], [x28]\n" - "ld1 { v29.b }[2], [x27]\n" - "ld1 { v28.b }[2], [x26]\n" - "ld1 { v27.b }[2], [x25]\n" - "ld1 { v20.b }[2], [x24]\n" - "ld1 
{ v26.b }[2], [x23]\n" - "ld1 { v19.b }[2], [x22]\n" - "ld1 { v18.b }[2], [x21]\n" + "ld1 { v29.b }[2], [x28]\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v27.b }[2], [x26]\n" + "ld1 { v26.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v24.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v22.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b30, [x28, #0x0]\n" - "ldr b29, [x27, #0x0]\n" + "ldr b29, [x28, #0x0]\n" + "ldr b28, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b28, [x26, #0x0]\n" - "ldr b27, [x25, #0x0]\n" - "ldr b20, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b19, [x22, #0x0]\n" - "ldr b18, [x21, #0x0]\n" + "ldr b27, [x26, #0x0]\n" + "ldr b26, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b24, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b22, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v22.4s, v30.4s, v28.4s\n" "zip1 v21.4s, v29.4s, v27.4s\n" + "zip1 v20.4s, v28.4s, v26.4s\n" "subs x20, x20, #0x1\n" - "zip1 v17.4s, v20.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v18.4s\n" - "zip1 v25.4s, v22.4s, v21.4s\n" - "zip1 v24.4s, v17.4s, v16.4s\n" - "str q25, [%x[out_ptr], #0x0]\n" - "sadalp v2.8h, v25.16b\n" - "str q24, [%x[out_ptr], #0x10]\n" - "sadalp v1.8h, v24.16b\n" + "zip1 v19.4s, v25.4s, v23.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v21.4s, v20.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v23.4s, v22.4s, v21.4s\n" - "zip2 v22.4s, v17.4s, v16.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "subs x20, x20, #0x1\n" - "str q23, [%x[out_ptr], #0x0]\n" - "sadalp v2.8h, v23.16b\n" - "str q22, [%x[out_ptr], #0x10]\n" - "sadalp v1.8h, v22.16b\n" + "str q17, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v21.4s, v30.4s, v28.4s\n" - "zip2 v17.4s, v29.4s, v27.4s\n" + "zip2 v21.4s, v29.4s, v27.4s\n" + "zip2 v20.4s, v28.4s, v26.4s\n" "subs x20, x20, #0x1\n" - "zip2 v20.4s, v20.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v18.4s\n" - "zip1 v19.4s, v21.4s, v17.4s\n" - "zip1 v18.4s, v20.4s, v16.4s\n" - "str q19, [%x[out_ptr], #0x0]\n" - "sadalp v2.8h, v19.16b\n" - "str q18, [%x[out_ptr], #0x10]\n" - "sadalp v1.8h, v18.16b\n" + "zip2 v19.4s, v25.4s, v23.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v21.4s, v20.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "sadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v21.4s, v17.4s\n" - "zip2 v16.4s, v20.4s, v16.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q17, [%x[out_ptr], #0x0]\n" "sadalp v2.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp index 44a79c0f0a..17eb7d5556 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp @@ -153,202 +153,202 @@ void interleave_block<8, 4, VLType::None, true>( "5:" // Main loop skip "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" - "ldr d30, 
[x28], #0x8\n" - "ldr d29, [x27], #0x8\n" - "ldr d28, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d19, [x22], #0x8\n" - "ldr d18, [x21], #0x8\n" + "ldr d29, [x28], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d23, [x22], #0x8\n" + "ldr d22, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" - "ld1 { v30.s }[2], [x28], #0x4\n" - "ld1 { v29.s }[2], [x27], #0x4\n" - "ld1 { v28.s }[2], [x26], #0x4\n" - "ld1 { v27.s }[2], [x25], #0x4\n" - "ld1 { v20.s }[2], [x24], #0x4\n" - "ld1 { v26.s }[2], [x23], #0x4\n" - "ld1 { v19.s }[2], [x22], #0x4\n" - "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v29.s }[2], [x28], #0x4\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v27.s }[2], [x26], #0x4\n" + "ld1 { v26.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v23.s }[2], [x22], #0x4\n" + "ld1 { v22.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" - "ld1 { v30.h }[6], [x28], #0x2\n" - "ld1 { v29.h }[6], [x27], #0x2\n" + "ld1 { v29.h }[6], [x28], #0x2\n" + "ld1 { v28.h }[6], [x27], #0x2\n" "mov x20, #0x4\n" - "ld1 { v28.h }[6], [x26], #0x2\n" - "ld1 { v27.h }[6], [x25], #0x2\n" - "ld1 { v20.h }[6], [x24], #0x2\n" - "ld1 { v26.h }[6], [x23], #0x2\n" - "ld1 { v19.h }[6], [x22], #0x2\n" - "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v27.h }[6], [x26], #0x2\n" + "ld1 { v26.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v24.h }[6], [x23], #0x2\n" + "ld1 { v23.h }[6], [x22], #0x2\n" + "ld1 { v22.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[14], [x28]\n" - "ld1 { v29.b }[14], [x27]\n" - "ld1 { v28.b }[14], [x26]\n" - "ld1 { v27.b }[14], [x25]\n" - "ld1 { v20.b }[14], [x24]\n" - "ld1 { v26.b }[14], [x23]\n" - "ld1 { v19.b }[14], [x22]\n" - "ld1 { v18.b }[14], [x21]\n" + "ld1 { v29.b }[14], [x28]\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v27.b }[14], [x26]\n" + "ld1 { v26.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v24.b }[14], [x23]\n" + "ld1 { v23.b }[14], [x22]\n" + "ld1 { v22.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x20, #0x3\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[12], [x28]\n" - "ld1 { v29.b }[12], [x27]\n" + "ld1 { v29.b }[12], [x28]\n" + "ld1 { v28.b }[12], [x27]\n" "mov x20, #0x4\n" - "ld1 { v28.b }[12], [x26]\n" - "ld1 { v27.b }[12], [x25]\n" - "ld1 { v20.b }[12], [x24]\n" - "ld1 { v26.b }[12], [x23]\n" - "ld1 { v19.b }[12], [x22]\n" - "ld1 { v18.b }[12], [x21]\n" + "ld1 { v27.b }[12], [x26]\n" + "ld1 { v26.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v24.b }[12], [x23]\n" + "ld1 { v23.b }[12], [x22]\n" + "ld1 { v22.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" - "ld1 { v30.h }[4], [x28], #0x2\n" - "ld1 { v29.h }[4], [x27], #0x2\n" + "ld1 { v29.h }[4], [x28], #0x2\n" + "ld1 { v28.h }[4], [x27], #0x2\n" "mov x20, #0x3\n" - "ld1 { v28.h }[4], [x26], #0x2\n" - "ld1 { v27.h }[4], [x25], #0x2\n" - "ld1 { v20.h }[4], [x24], #0x2\n" - "ld1 { v26.h }[4], [x23], #0x2\n" - "ld1 { v19.h }[4], [x22], #0x2\n" - "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v27.h }[4], [x26], #0x2\n" + "ld1 { v26.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v24.h }[4], [x23], #0x2\n" + "ld1 { v23.h }[4], [x22], #0x2\n" + "ld1 { v22.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[10], [x28]\n" - "ld1 { v29.b }[10], [x27]\n" - "ld1 { v28.b }[10], [x26]\n" - "ld1 { v27.b }[10], 
[x25]\n" - "ld1 { v20.b }[10], [x24]\n" - "ld1 { v26.b }[10], [x23]\n" - "ld1 { v19.b }[10], [x22]\n" - "ld1 { v18.b }[10], [x21]\n" + "ld1 { v29.b }[10], [x28]\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v27.b }[10], [x26]\n" + "ld1 { v26.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v24.b }[10], [x23]\n" + "ld1 { v23.b }[10], [x22]\n" + "ld1 { v22.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[8], [x28]\n" - "ld1 { v29.b }[8], [x27]\n" + "ld1 { v29.b }[8], [x28]\n" + "ld1 { v28.b }[8], [x27]\n" "mov x20, #0x3\n" - "ld1 { v28.b }[8], [x26]\n" - "ld1 { v27.b }[8], [x25]\n" - "ld1 { v20.b }[8], [x24]\n" - "ld1 { v26.b }[8], [x23]\n" - "ld1 { v19.b }[8], [x22]\n" - "ld1 { v18.b }[8], [x21]\n" + "ld1 { v27.b }[8], [x26]\n" + "ld1 { v26.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v24.b }[8], [x23]\n" + "ld1 { v23.b }[8], [x22]\n" + "ld1 { v22.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" - "ldr s30, [x28], #0x4\n" - "ldr s29, [x27], #0x4\n" - "ldr s28, [x26], #0x4\n" - "ldr s27, [x25], #0x4\n" - "ldr s20, [x24], #0x4\n" - "ldr s26, [x23], #0x4\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" + "ldr s29, [x28], #0x4\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "ld1 { v29.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x28], #0x2\n" + "ld1 { v28.h }[2], [x27], #0x2\n" "mov x20, #0x2\n" - "ld1 { v28.h }[2], [x26], #0x2\n" - "ld1 { v27.h }[2], [x25], #0x2\n" - "ld1 { v20.h }[2], [x24], #0x2\n" - "ld1 { v26.h }[2], [x23], #0x2\n" - "ld1 { v19.h }[2], [x22], #0x2\n" - "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "ld1 { v26.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "ld1 { v23.h }[2], [x22], #0x2\n" + "ld1 { v22.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[6], [x28]\n" - "ld1 { v29.b }[6], [x27]\n" - "ld1 { v28.b }[6], [x26]\n" - "ld1 { v27.b }[6], [x25]\n" - "ld1 { v20.b }[6], [x24]\n" - "ld1 { v26.b }[6], [x23]\n" - "ld1 { v19.b }[6], [x22]\n" - "ld1 { v18.b }[6], [x21]\n" + "ld1 { v29.b }[6], [x28]\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v27.b }[6], [x26]\n" + "ld1 { v26.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v24.b }[6], [x23]\n" + "ld1 { v23.b }[6], [x22]\n" + "ld1 { v22.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[4], [x28]\n" - "ld1 { v29.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x28]\n" + "ld1 { v28.b }[4], [x27]\n" "mov x20, #0x2\n" - "ld1 { v28.b }[4], [x26]\n" - "ld1 { v27.b }[4], [x25]\n" - "ld1 { v20.b }[4], [x24]\n" - "ld1 { v26.b }[4], [x23]\n" - "ld1 { v19.b }[4], [x22]\n" - "ld1 { v18.b }[4], [x21]\n" + "ld1 { v27.b }[4], [x26]\n" + "ld1 { v26.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v24.b }[4], [x23]\n" + "ld1 { v23.b }[4], [x22]\n" + "ld1 { v22.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" - "ldr h30, [x28], #0x2\n" - "ldr h29, [x27], #0x2\n" + "ldr h29, [x28], #0x2\n" + "ldr h28, [x27], #0x2\n" "mov x20, #0x1\n" - "ldr h28, [x26], #0x2\n" - "ldr h27, [x25], #0x2\n" - "ldr h20, [x24], #0x2\n" - "ldr h26, [x23], #0x2\n" - "ldr h19, [x22], #0x2\n" - "ldr h18, [x21], #0x2\n" + "ldr h27, [x26], #0x2\n" + "ldr h26, [x25], #0x2\n" + 
"ldr h25, [x24], #0x2\n" + "ldr h24, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "ldr h22, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" - "ld1 { v30.b }[2], [x28]\n" - "ld1 { v29.b }[2], [x27]\n" - "ld1 { v28.b }[2], [x26]\n" - "ld1 { v27.b }[2], [x25]\n" - "ld1 { v20.b }[2], [x24]\n" - "ld1 { v26.b }[2], [x23]\n" - "ld1 { v19.b }[2], [x22]\n" - "ld1 { v18.b }[2], [x21]\n" + "ld1 { v29.b }[2], [x28]\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v27.b }[2], [x26]\n" + "ld1 { v26.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v24.b }[2], [x23]\n" + "ld1 { v23.b }[2], [x22]\n" + "ld1 { v22.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 - "ldr b30, [x28, #0x0]\n" - "ldr b29, [x27, #0x0]\n" + "ldr b29, [x28, #0x0]\n" + "ldr b28, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b28, [x26, #0x0]\n" - "ldr b27, [x25, #0x0]\n" - "ldr b20, [x24, #0x0]\n" - "ldr b26, [x23, #0x0]\n" - "ldr b19, [x22, #0x0]\n" - "ldr b18, [x21, #0x0]\n" + "ldr b27, [x26, #0x0]\n" + "ldr b26, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b24, [x23, #0x0]\n" + "ldr b23, [x22, #0x0]\n" + "ldr b22, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v22.4s, v30.4s, v28.4s\n" "zip1 v21.4s, v29.4s, v27.4s\n" + "zip1 v20.4s, v28.4s, v26.4s\n" "subs x20, x20, #0x1\n" - "zip1 v17.4s, v20.4s, v19.4s\n" - "zip1 v16.4s, v26.4s, v18.4s\n" - "zip1 v25.4s, v22.4s, v21.4s\n" - "zip1 v24.4s, v17.4s, v16.4s\n" - "str q25, [%x[out_ptr], #0x0]\n" - "uadalp v2.8h, v25.16b\n" - "str q24, [%x[out_ptr], #0x10]\n" - "uadalp v1.8h, v24.16b\n" + "zip1 v19.4s, v25.4s, v23.4s\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v21.4s, v20.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v23.4s, v22.4s, v21.4s\n" - "zip2 v22.4s, v17.4s, v16.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "subs x20, x20, #0x1\n" - "str q23, [%x[out_ptr], #0x0]\n" - "uadalp v2.8h, v23.16b\n" - "str q22, [%x[out_ptr], #0x10]\n" - "uadalp v1.8h, v22.16b\n" + "str q17, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v21.4s, v30.4s, v28.4s\n" - "zip2 v17.4s, v29.4s, v27.4s\n" + "zip2 v21.4s, v29.4s, v27.4s\n" + "zip2 v20.4s, v28.4s, v26.4s\n" "subs x20, x20, #0x1\n" - "zip2 v20.4s, v20.4s, v19.4s\n" - "zip2 v16.4s, v26.4s, v18.4s\n" - "zip1 v19.4s, v21.4s, v17.4s\n" - "zip1 v18.4s, v20.4s, v16.4s\n" - "str q19, [%x[out_ptr], #0x0]\n" - "uadalp v2.8h, v19.16b\n" - "str q18, [%x[out_ptr], #0x10]\n" - "uadalp v1.8h, v18.16b\n" + "zip2 v19.4s, v25.4s, v23.4s\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v21.4s, v20.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "uadalp v2.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v1.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" "beq 14f\n" - "zip2 v17.4s, v21.4s, v17.4s\n" - "zip2 v16.4s, v20.4s, v16.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" "str q17, [%x[out_ptr], #0x0]\n" "uadalp v2.8h, v17.16b\n" "str q16, [%x[out_ptr], #0x10]\n" diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp index 4bfb36082e..7b445ef3d4 100644 --- 
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp @@ -79,18 +79,18 @@ void interleave_block<8, 8, VLType::None, false>( "prfm pldl1keep, [x21, #0x40]\n" "blt 3f\n" "2:" // Main loop head - "ldr q26, [x28], #0x10\n" - "ldr q21, [x27], #0x10\n" + "ldr q20, [x28], #0x10\n" + "ldr q19, [x27], #0x10\n" "subs %x[width], %x[width], #0x10\n" "cmp %x[width], #0x10\n" "ldr q25, [x26], #0x10\n" "ldr q24, [x25], #0x10\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v20.2d, v19.2d\n" "zip1 v18.2d, v25.2d, v24.2d\n" "ldr q23, [x24], #0x10\n" "ldr q22, [x23], #0x10\n" "zip1 v17.2d, v23.2d, v22.2d\n" - "zip2 v21.2d, v26.2d, v21.2d\n" + "zip2 v21.2d, v20.2d, v19.2d\n" "ldr q20, [x22], #0x10\n" "ldr q19, [x21], #0x10\n" "str q16, [%x[out_ptr], #0x0]\n" @@ -118,188 +118,187 @@ void interleave_block<8, 8, VLType::None, false>( "3:" // Main loop skip "cbz %x[width], 12f\n" "tbz %x[width], #3, 7f\n" - "ldr d26, [x28], #0x8\n" - "ldr d21, [x27], #0x8\n" - "ldr d25, [x26], #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "ldr d22, [x23], #0x8\n" - "ldr d20, [x22], #0x8\n" - "ldr d19, [x21], #0x8\n" + "ldr d25, [x28], #0x8\n" + "ldr d24, [x27], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" "tbz %x[width], #2, 5f\n" - "ld1 { v26.s }[2], [x28], #0x4\n" - "ld1 { v21.s }[2], [x27], #0x4\n" - "ld1 { v25.s }[2], [x26], #0x4\n" - "ld1 { v24.s }[2], [x25], #0x4\n" - "ld1 { v23.s }[2], [x24], #0x4\n" - "ld1 { v22.s }[2], [x23], #0x4\n" - "ld1 { v20.s }[2], [x22], #0x4\n" - "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v25.s }[2], [x28], #0x4\n" + "ld1 { v24.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 4f\n" - "ld1 { v26.h }[6], [x28], #0x2\n" - "ld1 { v21.h }[6], [x27], #0x2\n" + "ld1 { v25.h }[6], [x28], #0x2\n" + "ld1 { v24.h }[6], [x27], #0x2\n" "mov x20, #0x2\n" - "ld1 { v25.h }[6], [x26], #0x2\n" - "ld1 { v24.h }[6], [x25], #0x2\n" - "ld1 { v23.h }[6], [x24], #0x2\n" - "ld1 { v22.h }[6], [x23], #0x2\n" - "ld1 { v20.h }[6], [x22], #0x2\n" - "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v23.h }[6], [x26], #0x2\n" + "ld1 { v22.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v20.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[14], [x28]\n" - "ld1 { v21.b }[14], [x27]\n" - "ld1 { v25.b }[14], [x26]\n" - "ld1 { v24.b }[14], [x25]\n" - "ld1 { v23.b }[14], [x24]\n" - "ld1 { v22.b }[14], [x23]\n" - "ld1 { v20.b }[14], [x22]\n" - "ld1 { v19.b }[14], [x21]\n" + "ld1 { v25.b }[14], [x28]\n" + "ld1 { v24.b }[14], [x27]\n" + "ld1 { v23.b }[14], [x26]\n" + "ld1 { v22.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v20.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" "b 11f\n" "4:" // odd_loads_1_12 "mov x20, #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[12], [x28]\n" - "ld1 { v21.b }[12], [x27]\n" - "ld1 { v25.b }[12], [x26]\n" - "ld1 { v24.b }[12], [x25]\n" - "ld1 { v23.b }[12], [x24]\n" - "ld1 { v22.b }[12], [x23]\n" - "ld1 { v20.b }[12], [x22]\n" - "ld1 { v19.b }[12], [x21]\n" + "ld1 { v25.b }[12], 
[x28]\n" + "ld1 { v24.b }[12], [x27]\n" + "ld1 { v23.b }[12], [x26]\n" + "ld1 { v22.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v20.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" "b 11f\n" "5:" // odd_loads_2_8 "tbz %x[width], #1, 6f\n" - "ld1 { v26.h }[4], [x28], #0x2\n" - "ld1 { v21.h }[4], [x27], #0x2\n" + "ld1 { v25.h }[4], [x28], #0x2\n" + "ld1 { v24.h }[4], [x27], #0x2\n" "mov x20, #0x2\n" - "ld1 { v25.h }[4], [x26], #0x2\n" - "ld1 { v24.h }[4], [x25], #0x2\n" - "ld1 { v23.h }[4], [x24], #0x2\n" - "ld1 { v22.h }[4], [x23], #0x2\n" - "ld1 { v20.h }[4], [x22], #0x2\n" - "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v23.h }[4], [x26], #0x2\n" + "ld1 { v22.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v20.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[10], [x28]\n" - "ld1 { v21.b }[10], [x27]\n" - "ld1 { v25.b }[10], [x26]\n" - "ld1 { v24.b }[10], [x25]\n" - "ld1 { v23.b }[10], [x24]\n" - "ld1 { v22.b }[10], [x23]\n" - "ld1 { v20.b }[10], [x22]\n" - "ld1 { v19.b }[10], [x21]\n" + "ld1 { v25.b }[10], [x28]\n" + "ld1 { v24.b }[10], [x27]\n" + "ld1 { v23.b }[10], [x26]\n" + "ld1 { v22.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v20.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" "b 11f\n" "6:" // odd_loads_1_8 "mov x20, #0x1\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[8], [x28]\n" - "ld1 { v21.b }[8], [x27]\n" + "ld1 { v25.b }[8], [x28]\n" + "ld1 { v24.b }[8], [x27]\n" "mov x20, #0x2\n" - "ld1 { v25.b }[8], [x26]\n" - "ld1 { v24.b }[8], [x25]\n" - "ld1 { v23.b }[8], [x24]\n" - "ld1 { v22.b }[8], [x23]\n" - "ld1 { v20.b }[8], [x22]\n" - "ld1 { v19.b }[8], [x21]\n" + "ld1 { v23.b }[8], [x26]\n" + "ld1 { v22.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v20.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" "b 11f\n" "7:" // odd_loads_4_0 "tbz %x[width], #2, 9f\n" - "ldr s26, [x28], #0x4\n" - "ldr s21, [x27], #0x4\n" - "ldr s25, [x26], #0x4\n" - "ldr s24, [x25], #0x4\n" - "ldr s23, [x24], #0x4\n" - "ldr s22, [x23], #0x4\n" - "ldr s20, [x22], #0x4\n" - "ldr s19, [x21], #0x4\n" + "ldr s25, [x28], #0x4\n" + "ldr s24, [x27], #0x4\n" + "ldr s23, [x26], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" "tbz %x[width], #1, 8f\n" - "ld1 { v26.h }[2], [x28], #0x2\n" - "ld1 { v21.h }[2], [x27], #0x2\n" + "ld1 { v25.h }[2], [x28], #0x2\n" + "ld1 { v24.h }[2], [x27], #0x2\n" "mov x20, #0x1\n" - "ld1 { v25.h }[2], [x26], #0x2\n" - "ld1 { v24.h }[2], [x25], #0x2\n" - "ld1 { v23.h }[2], [x24], #0x2\n" - "ld1 { v22.h }[2], [x23], #0x2\n" - "ld1 { v20.h }[2], [x22], #0x2\n" - "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v23.h }[2], [x26], #0x2\n" + "ld1 { v22.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v20.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[6], [x28]\n" - "ld1 { v21.b }[6], [x27]\n" - "ld1 { v25.b }[6], [x26]\n" - "ld1 { v24.b }[6], [x25]\n" - "ld1 { v23.b }[6], [x24]\n" - "ld1 { v22.b }[6], [x23]\n" - "ld1 { v20.b }[6], [x22]\n" - "ld1 { v19.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x28]\n" + "ld1 { v24.b }[6], [x27]\n" + "ld1 { v23.b }[6], [x26]\n" + "ld1 { v22.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v20.b }[6], [x23]\n" + "ld1 { v19.b }[6], 
[x22]\n" + "ld1 { v18.b }[6], [x21]\n" "b 11f\n" "8:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[4], [x28]\n" - "ld1 { v21.b }[4], [x27]\n" - "ld1 { v25.b }[4], [x26]\n" - "ld1 { v24.b }[4], [x25]\n" - "ld1 { v23.b }[4], [x24]\n" - "ld1 { v22.b }[4], [x23]\n" - "ld1 { v20.b }[4], [x22]\n" - "ld1 { v19.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x28]\n" + "ld1 { v24.b }[4], [x27]\n" + "ld1 { v23.b }[4], [x26]\n" + "ld1 { v22.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v20.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" "b 11f\n" "9:" // odd_loads_2_0 "tbz %x[width], #1, 10f\n" - "ldr h26, [x28], #0x2\n" - "ldr h21, [x27], #0x2\n" + "ldr h25, [x28], #0x2\n" + "ldr h24, [x27], #0x2\n" "mov x20, #0x1\n" - "ldr h25, [x26], #0x2\n" - "ldr h24, [x25], #0x2\n" - "ldr h23, [x24], #0x2\n" - "ldr h22, [x23], #0x2\n" - "ldr h20, [x22], #0x2\n" - "ldr h19, [x21], #0x2\n" + "ldr h23, [x26], #0x2\n" + "ldr h22, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h20, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], #0x2\n" "tbz %x[width], #0, 11f\n" - "ld1 { v26.b }[2], [x28]\n" - "ld1 { v21.b }[2], [x27]\n" - "ld1 { v25.b }[2], [x26]\n" - "ld1 { v24.b }[2], [x25]\n" - "ld1 { v23.b }[2], [x24]\n" - "ld1 { v22.b }[2], [x23]\n" - "ld1 { v20.b }[2], [x22]\n" - "ld1 { v19.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x28]\n" + "ld1 { v24.b }[2], [x27]\n" + "ld1 { v23.b }[2], [x26]\n" + "ld1 { v22.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v20.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" "b 11f\n" "10:" // odd_loads_1_0 - "ldr b26, [x28, #0x0]\n" - "ldr b21, [x27, #0x0]\n" + "ldr b25, [x28, #0x0]\n" + "ldr b24, [x27, #0x0]\n" "mov x20, #0x1\n" - "ldr b25, [x26, #0x0]\n" - "ldr b24, [x25, #0x0]\n" - "ldr b23, [x24, #0x0]\n" - "ldr b22, [x23, #0x0]\n" - "ldr b20, [x22, #0x0]\n" - "ldr b19, [x21, #0x0]\n" + "ldr b23, [x26, #0x0]\n" + "ldr b22, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b20, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" "11:" // Odd load end "subs x20, x20, #0x1\n" - "zip1 v16.2d, v26.2d, v21.2d\n" + "zip1 v16.2d, v25.2d, v24.2d\n" "str q16, [%x[out_ptr], #0x0]\n" - "zip1 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip1 v17.2d, v23.2d, v22.2d\n" - "zip1 v16.2d, v20.2d, v19.2d\n" + "zip1 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v21.2d, v20.2d\n" + "zip1 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 12f\n" - "zip2 v21.2d, v26.2d, v21.2d\n" - "str q21, [%x[out_ptr], #0x0]\n" - "zip2 v18.2d, v25.2d, v24.2d\n" - "str q18, [%x[out_ptr], #0x10]\n" - "zip2 v17.2d, v23.2d, v22.2d\n" - "zip2 v16.2d, v20.2d, v19.2d\n" + "zip2 v16.2d, v25.2d, v24.2d\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v23.2d, v22.2d\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.2d, v21.2d, v20.2d\n" + "zip2 v16.2d, v19.2d, v18.2d\n" "str q17, [%x[out_ptr], #0x20]\n" "str q16, [%x[out_ptr], #0x30]\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "12:" // Odds skip - : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) - : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp index c6ad2949f5..a2288e8299 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp @@ -156,182 +156,182 @@ void interleave_block<8, 8, VLType::None, true>( "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" "ldr d27, [x28], #0x8\n" - "ldr d19, [x27], #0x8\n" + "ldr d26, [x27], #0x8\n" "ldr d25, [x26], #0x8\n" - "ldr d18, [x25], #0x8\n" + "ldr d24, [x25], #0x8\n" "ldr d23, [x24], #0x8\n" - "ldr d17, [x23], #0x8\n" + "ldr d22, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" - "ldr d16, [x21], #0x8\n" + "ldr d20, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" "ld1 { v27.s }[2], [x28], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v26.s }[2], [x27], #0x4\n" "ld1 { v25.s }[2], [x26], #0x4\n" - "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x25], #0x4\n" "ld1 { v23.s }[2], [x24], #0x4\n" - "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v16.s }[2], [x21], #0x4\n" + "ld1 { v20.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v27.h }[6], [x28], #0x2\n" - "ld1 { v19.h }[6], [x27], #0x2\n" + "ld1 { v26.h }[6], [x27], #0x2\n" "mov x20, #0x2\n" "ld1 { v25.h }[6], [x26], #0x2\n" - "ld1 { v18.h }[6], [x25], #0x2\n" + "ld1 { v24.h }[6], [x25], #0x2\n" "ld1 { v23.h }[6], [x24], #0x2\n" - "ld1 { v17.h }[6], [x23], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" "ld1 { v21.h }[6], [x22], #0x2\n" - "ld1 { v16.h }[6], [x21], #0x2\n" + "ld1 { v20.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[14], [x28]\n" - "ld1 { v19.b }[14], [x27]\n" + "ld1 { v26.b }[14], [x27]\n" "ld1 { v25.b }[14], [x26]\n" - "ld1 { v18.b }[14], [x25]\n" + "ld1 { v24.b }[14], [x25]\n" "ld1 { v23.b }[14], [x24]\n" - "ld1 { v17.b }[14], [x23]\n" + "ld1 { v22.b }[14], [x23]\n" "ld1 { v21.b }[14], [x22]\n" - "ld1 { v16.b }[14], [x21]\n" + "ld1 { v20.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[12], [x28]\n" - "ld1 { v19.b }[12], [x27]\n" + "ld1 { v26.b }[12], [x27]\n" "ld1 { v25.b }[12], [x26]\n" - "ld1 { v18.b }[12], [x25]\n" + "ld1 { v24.b }[12], [x25]\n" "ld1 { v23.b }[12], [x24]\n" - "ld1 { v17.b }[12], [x23]\n" + "ld1 { v22.b }[12], [x23]\n" "ld1 { v21.b }[12], [x22]\n" - "ld1 { v16.b }[12], [x21]\n" + "ld1 { v20.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" "ld1 { v27.h }[4], [x28], #0x2\n" - "ld1 { v19.h }[4], [x27], #0x2\n" + "ld1 { v26.h }[4], [x27], #0x2\n" "mov x20, #0x2\n" "ld1 { v25.h }[4], [x26], #0x2\n" - "ld1 { v18.h }[4], [x25], #0x2\n" + "ld1 { v24.h }[4], [x25], #0x2\n" "ld1 { v23.h }[4], [x24], #0x2\n" - "ld1 { v17.h }[4], [x23], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" "ld1 { v21.h }[4], [x22], #0x2\n" - "ld1 { v16.h }[4], [x21], #0x2\n" + "ld1 { v20.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[10], [x28]\n" - "ld1 { v19.b }[10], [x27]\n" + "ld1 { v26.b }[10], [x27]\n" "ld1 { v25.b }[10], [x26]\n" - "ld1 { v18.b }[10], [x25]\n" + "ld1 { v24.b }[10], [x25]\n" "ld1 { v23.b }[10], [x24]\n" - "ld1 { v17.b }[10], [x23]\n" + "ld1 { v22.b }[10], [x23]\n" "ld1 { v21.b }[10], [x22]\n" - "ld1 { v16.b }[10], [x21]\n" + 
"ld1 { v20.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[8], [x28]\n" - "ld1 { v19.b }[8], [x27]\n" + "ld1 { v26.b }[8], [x27]\n" "mov x20, #0x2\n" "ld1 { v25.b }[8], [x26]\n" - "ld1 { v18.b }[8], [x25]\n" + "ld1 { v24.b }[8], [x25]\n" "ld1 { v23.b }[8], [x24]\n" - "ld1 { v17.b }[8], [x23]\n" + "ld1 { v22.b }[8], [x23]\n" "ld1 { v21.b }[8], [x22]\n" - "ld1 { v16.b }[8], [x21]\n" + "ld1 { v20.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" "ldr s27, [x28], #0x4\n" - "ldr s19, [x27], #0x4\n" + "ldr s26, [x27], #0x4\n" "ldr s25, [x26], #0x4\n" - "ldr s18, [x25], #0x4\n" + "ldr s24, [x25], #0x4\n" "ldr s23, [x24], #0x4\n" - "ldr s17, [x23], #0x4\n" + "ldr s22, [x23], #0x4\n" "ldr s21, [x22], #0x4\n" - "ldr s16, [x21], #0x4\n" + "ldr s20, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" "ld1 { v27.h }[2], [x28], #0x2\n" - "ld1 { v19.h }[2], [x27], #0x2\n" + "ld1 { v26.h }[2], [x27], #0x2\n" "mov x20, #0x1\n" "ld1 { v25.h }[2], [x26], #0x2\n" - "ld1 { v18.h }[2], [x25], #0x2\n" + "ld1 { v24.h }[2], [x25], #0x2\n" "ld1 { v23.h }[2], [x24], #0x2\n" - "ld1 { v17.h }[2], [x23], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v16.h }[2], [x21], #0x2\n" + "ld1 { v20.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[6], [x28]\n" - "ld1 { v19.b }[6], [x27]\n" + "ld1 { v26.b }[6], [x27]\n" "ld1 { v25.b }[6], [x26]\n" - "ld1 { v18.b }[6], [x25]\n" + "ld1 { v24.b }[6], [x25]\n" "ld1 { v23.b }[6], [x24]\n" - "ld1 { v17.b }[6], [x23]\n" + "ld1 { v22.b }[6], [x23]\n" "ld1 { v21.b }[6], [x22]\n" - "ld1 { v16.b }[6], [x21]\n" + "ld1 { v20.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[4], [x28]\n" - "ld1 { v19.b }[4], [x27]\n" + "ld1 { v26.b }[4], [x27]\n" "ld1 { v25.b }[4], [x26]\n" - "ld1 { v18.b }[4], [x25]\n" + "ld1 { v24.b }[4], [x25]\n" "ld1 { v23.b }[4], [x24]\n" - "ld1 { v17.b }[4], [x23]\n" + "ld1 { v22.b }[4], [x23]\n" "ld1 { v21.b }[4], [x22]\n" - "ld1 { v16.b }[4], [x21]\n" + "ld1 { v20.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" "ldr h27, [x28], #0x2\n" - "ldr h19, [x27], #0x2\n" + "ldr h26, [x27], #0x2\n" "mov x20, #0x1\n" "ldr h25, [x26], #0x2\n" - "ldr h18, [x25], #0x2\n" + "ldr h24, [x25], #0x2\n" "ldr h23, [x24], #0x2\n" - "ldr h17, [x23], #0x2\n" + "ldr h22, [x23], #0x2\n" "ldr h21, [x22], #0x2\n" - "ldr h16, [x21], #0x2\n" + "ldr h20, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[2], [x28]\n" - "ld1 { v19.b }[2], [x27]\n" + "ld1 { v26.b }[2], [x27]\n" "ld1 { v25.b }[2], [x26]\n" - "ld1 { v18.b }[2], [x25]\n" + "ld1 { v24.b }[2], [x25]\n" "ld1 { v23.b }[2], [x24]\n" - "ld1 { v17.b }[2], [x23]\n" + "ld1 { v22.b }[2], [x23]\n" "ld1 { v21.b }[2], [x22]\n" - "ld1 { v16.b }[2], [x21]\n" + "ld1 { v20.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 "ldr b27, [x28, #0x0]\n" - "ldr b19, [x27, #0x0]\n" + "ldr b26, [x27, #0x0]\n" "mov x20, #0x1\n" "ldr b25, [x26, #0x0]\n" - "ldr b18, [x25, #0x0]\n" + "ldr b24, [x25, #0x0]\n" "ldr b23, [x24, #0x0]\n" - "ldr b17, [x23, #0x0]\n" + "ldr b22, [x23, #0x0]\n" "ldr b21, [x22, #0x0]\n" - "ldr b16, [x21, #0x0]\n" + "ldr b20, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v26.2d, v27.2d, v19.2d\n" - "zip1 v24.2d, v25.2d, v18.2d\n" + "zip1 v19.2d, v27.2d, v26.2d\n" + "zip1 v18.2d, v25.2d, v24.2d\n" "subs x20, x20, #0x1\n" - "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v22.2d, v23.2d, v17.2d\n" - "zip1 v20.2d, v21.2d, v16.2d\n" 
- "str q24, [%x[out_ptr], #0x10]\n" - "sadalp v5.8h, v26.16b\n" - "sadalp v4.8h, v24.16b\n" - "str q22, [%x[out_ptr], #0x20]\n" - "sadalp v3.8h, v22.16b\n" - "str q20, [%x[out_ptr], #0x30]\n" - "sadalp v2.8h, v20.16b\n" + "str q19, [%x[out_ptr], #0x0]\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip1 v16.2d, v21.2d, v20.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v5.8h, v19.16b\n" + "sadalp v4.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" - "zip2 v19.2d, v27.2d, v19.2d\n" - "zip2 v18.2d, v25.2d, v18.2d\n" + "zip2 v19.2d, v27.2d, v26.2d\n" + "zip2 v18.2d, v25.2d, v24.2d\n" "str q19, [%x[out_ptr], #0x0]\n" - "zip2 v17.2d, v23.2d, v17.2d\n" - "zip2 v16.2d, v21.2d, v16.2d\n" + "zip2 v17.2d, v23.2d, v22.2d\n" + "zip2 v16.2d, v21.2d, v20.2d\n" "str q18, [%x[out_ptr], #0x10]\n" "sadalp v5.8h, v19.16b\n" "sadalp v4.8h, v18.16b\n" @@ -346,11 +346,11 @@ void interleave_block<8, 8, VLType::None, true>( "sadalp v31.4s, v3.8h\n" "sadalp v30.4s, v2.8h\n" "addp v1.4s, v1.4s, v0.4s\n" - "addp v0.4s, v31.4s, v30.4s\n" + "addp v16.4s, v31.4s, v30.4s\n" "add v1.4s, v1.4s, v29.4s\n" - "add v0.4s, v0.4s, v28.4s\n" + "add v16.4s, v16.4s, v28.4s\n" "str q1, [%x[out_ptr], #0x0]\n" - "str q0, [%x[out_ptr], #0x10]\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp index 6c4a5fa62b..56d34a8a64 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp @@ -156,182 +156,182 @@ void interleave_block<8, 8, VLType::None, true>( "cbz %x[width], 14f\n" "tbz %x[width], #3, 9f\n" "ldr d27, [x28], #0x8\n" - "ldr d19, [x27], #0x8\n" + "ldr d26, [x27], #0x8\n" "ldr d25, [x26], #0x8\n" - "ldr d18, [x25], #0x8\n" + "ldr d24, [x25], #0x8\n" "ldr d23, [x24], #0x8\n" - "ldr d17, [x23], #0x8\n" + "ldr d22, [x23], #0x8\n" "ldr d21, [x22], #0x8\n" - "ldr d16, [x21], #0x8\n" + "ldr d20, [x21], #0x8\n" "tbz %x[width], #2, 7f\n" "ld1 { v27.s }[2], [x28], #0x4\n" - "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v26.s }[2], [x27], #0x4\n" "ld1 { v25.s }[2], [x26], #0x4\n" - "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x25], #0x4\n" "ld1 { v23.s }[2], [x24], #0x4\n" - "ld1 { v17.s }[2], [x23], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" "ld1 { v21.s }[2], [x22], #0x4\n" - "ld1 { v16.s }[2], [x21], #0x4\n" + "ld1 { v20.s }[2], [x21], #0x4\n" "tbz %x[width], #1, 6f\n" "ld1 { v27.h }[6], [x28], #0x2\n" - "ld1 { v19.h }[6], [x27], #0x2\n" + "ld1 { v26.h }[6], [x27], #0x2\n" "mov x20, #0x2\n" "ld1 { v25.h }[6], [x26], #0x2\n" - "ld1 { v18.h }[6], [x25], #0x2\n" + "ld1 { v24.h }[6], [x25], #0x2\n" "ld1 { v23.h }[6], [x24], #0x2\n" - "ld1 { v17.h }[6], [x23], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" "ld1 { v21.h }[6], [x22], #0x2\n" - "ld1 { v16.h }[6], [x21], #0x2\n" + "ld1 { v20.h }[6], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[14], [x28]\n" - "ld1 { v19.b }[14], [x27]\n" + "ld1 { v26.b }[14], [x27]\n" "ld1 { v25.b }[14], [x26]\n" - "ld1 { v18.b }[14], [x25]\n" + 
"ld1 { v24.b }[14], [x25]\n" "ld1 { v23.b }[14], [x24]\n" - "ld1 { v17.b }[14], [x23]\n" + "ld1 { v22.b }[14], [x23]\n" "ld1 { v21.b }[14], [x22]\n" - "ld1 { v16.b }[14], [x21]\n" + "ld1 { v20.b }[14], [x21]\n" "b 13f\n" "6:" // odd_loads_1_12 "mov x20, #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[12], [x28]\n" - "ld1 { v19.b }[12], [x27]\n" + "ld1 { v26.b }[12], [x27]\n" "ld1 { v25.b }[12], [x26]\n" - "ld1 { v18.b }[12], [x25]\n" + "ld1 { v24.b }[12], [x25]\n" "ld1 { v23.b }[12], [x24]\n" - "ld1 { v17.b }[12], [x23]\n" + "ld1 { v22.b }[12], [x23]\n" "ld1 { v21.b }[12], [x22]\n" - "ld1 { v16.b }[12], [x21]\n" + "ld1 { v20.b }[12], [x21]\n" "b 13f\n" "7:" // odd_loads_2_8 "tbz %x[width], #1, 8f\n" "ld1 { v27.h }[4], [x28], #0x2\n" - "ld1 { v19.h }[4], [x27], #0x2\n" + "ld1 { v26.h }[4], [x27], #0x2\n" "mov x20, #0x2\n" "ld1 { v25.h }[4], [x26], #0x2\n" - "ld1 { v18.h }[4], [x25], #0x2\n" + "ld1 { v24.h }[4], [x25], #0x2\n" "ld1 { v23.h }[4], [x24], #0x2\n" - "ld1 { v17.h }[4], [x23], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" "ld1 { v21.h }[4], [x22], #0x2\n" - "ld1 { v16.h }[4], [x21], #0x2\n" + "ld1 { v20.h }[4], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[10], [x28]\n" - "ld1 { v19.b }[10], [x27]\n" + "ld1 { v26.b }[10], [x27]\n" "ld1 { v25.b }[10], [x26]\n" - "ld1 { v18.b }[10], [x25]\n" + "ld1 { v24.b }[10], [x25]\n" "ld1 { v23.b }[10], [x24]\n" - "ld1 { v17.b }[10], [x23]\n" + "ld1 { v22.b }[10], [x23]\n" "ld1 { v21.b }[10], [x22]\n" - "ld1 { v16.b }[10], [x21]\n" + "ld1 { v20.b }[10], [x21]\n" "b 13f\n" "8:" // odd_loads_1_8 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[8], [x28]\n" - "ld1 { v19.b }[8], [x27]\n" + "ld1 { v26.b }[8], [x27]\n" "mov x20, #0x2\n" "ld1 { v25.b }[8], [x26]\n" - "ld1 { v18.b }[8], [x25]\n" + "ld1 { v24.b }[8], [x25]\n" "ld1 { v23.b }[8], [x24]\n" - "ld1 { v17.b }[8], [x23]\n" + "ld1 { v22.b }[8], [x23]\n" "ld1 { v21.b }[8], [x22]\n" - "ld1 { v16.b }[8], [x21]\n" + "ld1 { v20.b }[8], [x21]\n" "b 13f\n" "9:" // odd_loads_4_0 "tbz %x[width], #2, 11f\n" "ldr s27, [x28], #0x4\n" - "ldr s19, [x27], #0x4\n" + "ldr s26, [x27], #0x4\n" "ldr s25, [x26], #0x4\n" - "ldr s18, [x25], #0x4\n" + "ldr s24, [x25], #0x4\n" "ldr s23, [x24], #0x4\n" - "ldr s17, [x23], #0x4\n" + "ldr s22, [x23], #0x4\n" "ldr s21, [x22], #0x4\n" - "ldr s16, [x21], #0x4\n" + "ldr s20, [x21], #0x4\n" "tbz %x[width], #1, 10f\n" "ld1 { v27.h }[2], [x28], #0x2\n" - "ld1 { v19.h }[2], [x27], #0x2\n" + "ld1 { v26.h }[2], [x27], #0x2\n" "mov x20, #0x1\n" "ld1 { v25.h }[2], [x26], #0x2\n" - "ld1 { v18.h }[2], [x25], #0x2\n" + "ld1 { v24.h }[2], [x25], #0x2\n" "ld1 { v23.h }[2], [x24], #0x2\n" - "ld1 { v17.h }[2], [x23], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" "ld1 { v21.h }[2], [x22], #0x2\n" - "ld1 { v16.h }[2], [x21], #0x2\n" + "ld1 { v20.h }[2], [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[6], [x28]\n" - "ld1 { v19.b }[6], [x27]\n" + "ld1 { v26.b }[6], [x27]\n" "ld1 { v25.b }[6], [x26]\n" - "ld1 { v18.b }[6], [x25]\n" + "ld1 { v24.b }[6], [x25]\n" "ld1 { v23.b }[6], [x24]\n" - "ld1 { v17.b }[6], [x23]\n" + "ld1 { v22.b }[6], [x23]\n" "ld1 { v21.b }[6], [x22]\n" - "ld1 { v16.b }[6], [x21]\n" + "ld1 { v20.b }[6], [x21]\n" "b 13f\n" "10:" // odd_loads_1_4 "mov x20, #0x1\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[4], [x28]\n" - "ld1 { v19.b }[4], [x27]\n" + "ld1 { v26.b }[4], [x27]\n" "ld1 { v25.b }[4], [x26]\n" - "ld1 { v18.b }[4], [x25]\n" + "ld1 { v24.b }[4], [x25]\n" "ld1 { v23.b }[4], [x24]\n" - "ld1 { v17.b }[4], [x23]\n" + "ld1 { v22.b }[4], 
[x23]\n" "ld1 { v21.b }[4], [x22]\n" - "ld1 { v16.b }[4], [x21]\n" + "ld1 { v20.b }[4], [x21]\n" "b 13f\n" "11:" // odd_loads_2_0 "tbz %x[width], #1, 12f\n" "ldr h27, [x28], #0x2\n" - "ldr h19, [x27], #0x2\n" + "ldr h26, [x27], #0x2\n" "mov x20, #0x1\n" "ldr h25, [x26], #0x2\n" - "ldr h18, [x25], #0x2\n" + "ldr h24, [x25], #0x2\n" "ldr h23, [x24], #0x2\n" - "ldr h17, [x23], #0x2\n" + "ldr h22, [x23], #0x2\n" "ldr h21, [x22], #0x2\n" - "ldr h16, [x21], #0x2\n" + "ldr h20, [x21], #0x2\n" "tbz %x[width], #0, 13f\n" "ld1 { v27.b }[2], [x28]\n" - "ld1 { v19.b }[2], [x27]\n" + "ld1 { v26.b }[2], [x27]\n" "ld1 { v25.b }[2], [x26]\n" - "ld1 { v18.b }[2], [x25]\n" + "ld1 { v24.b }[2], [x25]\n" "ld1 { v23.b }[2], [x24]\n" - "ld1 { v17.b }[2], [x23]\n" + "ld1 { v22.b }[2], [x23]\n" "ld1 { v21.b }[2], [x22]\n" - "ld1 { v16.b }[2], [x21]\n" + "ld1 { v20.b }[2], [x21]\n" "b 13f\n" "12:" // odd_loads_1_0 "ldr b27, [x28, #0x0]\n" - "ldr b19, [x27, #0x0]\n" + "ldr b26, [x27, #0x0]\n" "mov x20, #0x1\n" "ldr b25, [x26, #0x0]\n" - "ldr b18, [x25, #0x0]\n" + "ldr b24, [x25, #0x0]\n" "ldr b23, [x24, #0x0]\n" - "ldr b17, [x23, #0x0]\n" + "ldr b22, [x23, #0x0]\n" "ldr b21, [x22, #0x0]\n" - "ldr b16, [x21, #0x0]\n" + "ldr b20, [x21, #0x0]\n" "13:" // Odd load end - "zip1 v26.2d, v27.2d, v19.2d\n" - "zip1 v24.2d, v25.2d, v18.2d\n" + "zip1 v19.2d, v27.2d, v26.2d\n" + "zip1 v18.2d, v25.2d, v24.2d\n" "subs x20, x20, #0x1\n" - "str q26, [%x[out_ptr], #0x0]\n" - "zip1 v22.2d, v23.2d, v17.2d\n" - "zip1 v20.2d, v21.2d, v16.2d\n" - "str q24, [%x[out_ptr], #0x10]\n" - "uadalp v5.8h, v26.16b\n" - "uadalp v4.8h, v24.16b\n" - "str q22, [%x[out_ptr], #0x20]\n" - "uadalp v3.8h, v22.16b\n" - "str q20, [%x[out_ptr], #0x30]\n" - "uadalp v2.8h, v20.16b\n" + "str q19, [%x[out_ptr], #0x0]\n" + "zip1 v17.2d, v23.2d, v22.2d\n" + "zip1 v16.2d, v21.2d, v20.2d\n" + "str q18, [%x[out_ptr], #0x10]\n" + "uadalp v5.8h, v19.16b\n" + "uadalp v4.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v16.16b\n" "add %x[out_ptr], %x[out_ptr], #0x40\n" "beq 14f\n" - "zip2 v19.2d, v27.2d, v19.2d\n" - "zip2 v18.2d, v25.2d, v18.2d\n" + "zip2 v19.2d, v27.2d, v26.2d\n" + "zip2 v18.2d, v25.2d, v24.2d\n" "str q19, [%x[out_ptr], #0x0]\n" - "zip2 v17.2d, v23.2d, v17.2d\n" - "zip2 v16.2d, v21.2d, v16.2d\n" + "zip2 v17.2d, v23.2d, v22.2d\n" + "zip2 v16.2d, v21.2d, v20.2d\n" "str q18, [%x[out_ptr], #0x10]\n" "uadalp v5.8h, v19.16b\n" "uadalp v4.8h, v18.16b\n" @@ -346,11 +346,11 @@ void interleave_block<8, 8, VLType::None, true>( "uadalp v31.4s, v3.8h\n" "uadalp v30.4s, v2.8h\n" "addp v1.4s, v1.4s, v0.4s\n" - "addp v0.4s, v31.4s, v30.4s\n" + "addp v16.4s, v31.4s, v30.4s\n" "add v1.4s, v1.4s, v29.4s\n" - "add v0.4s, v0.4s, v28.4s\n" + "add v16.4s, v16.4s, v28.4s\n" "str q1, [%x[out_ptr], #0x0]\n" - "str q0, [%x[out_ptr], #0x10]\n" + "str q16, [%x[out_ptr], #0x10]\n" "add %x[out_ptr], %x[out_ptr], #0x20\n" : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width) : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp index 51b91d16e1..a5f4754d3d 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave1VL_block2_fp32_bf16.hpp @@ -22,16 +22,14 @@ 
* SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME2) template <> void interleave_block<1, 2, VLType::SME, false>( bfloat16 * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x22, ALL, MUL #2\n" @@ -153,4 +151,4 @@ void interleave_block<1, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp index 25bfad18b1..c1d0ac5bc7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave2VL_block2_fp32_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME2) template <> void interleave_block<2, 2, VLType::SME, false>( bfloat16 * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x22, ALL, MUL #2\n" @@ -184,4 +182,4 @@ void interleave_block<2, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp index 9255831e86..03575d7ff2 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme2_interleave4VL_block2_fp32_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME2) template <> void interleave_block<4, 2, VLType::SME, false>( bfloat16 * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x23, ALL, MUL #2\n" @@ -159,4 +157,4 @@ void interleave_block<4, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp index 9b66a6fb10..453778ae3f 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_bf16_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 1, VLType::SME, false>( bfloat16 * &out, const bfloat16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "mov x21, %x[width]\n" @@ -168,9 +166,9 @@ void interleave_block<1, 1, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n" "add x12, x12, #0x1\n" "cmp x12, x11\n" "add x26, x26, #0x8\n" @@ -186,7 +184,7 @@ void interleave_block<1, 1, VLType::SME, false>( "cmp x12, x10\n" "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -206,4 +204,4 @@ void interleave_block<1, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp index d0375de76f..98bdcd2fa2 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block2_bf16_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 2, VLType::SME, false>( bfloat16 * &out, const bfloat16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cnth x22\n" @@ -176,11 +174,11 @@ void interleave_block<1, 2, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25396140 // psel p0.h, p8.h/Z, p10.h[w13, #1]\n" "cmp x12, x10\n" - ".inst 0xe0562321 // ld1h { za0h.h[x13, #1] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0xe0562281 // ld1h { za0h.h[x13, #1] }, p0/Z, [x20, x22, LSL #1]\n" "add x26, x26, #0x8\n" "addvl x21, x21, #1\n" "add x13, x13, #0x2\n" @@ -197,7 +195,7 @@ void interleave_block<1, 2, VLType::SME, false>( "addvl x21, x21, #1\n" "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -217,4 +215,4 @@ void interleave_block<1, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp index 622d9aa4fc..4390bb7c7f 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 4, VLType::SME, false>( int8_t * &out, const int8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntb x21\n" @@ -179,11 +177,11 @@ void interleave_block<1, 4, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "cmp x12, x9\n" - ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n" "add x26, x26, #0x8\n" "addvl x21, x21, #1\n" "add x13, x13, #0x4\n" @@ -200,7 +198,7 @@ void interleave_block<1, 4, VLType::SME, false>( "addvl x21, x21, #1\n" "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -220,4 +218,4 @@ void interleave_block<1, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp index 07f03702d9..f5ee261964 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_s8_s8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 4, VLType::SME, true>( @@ -200,12 +200,12 @@ void interleave_block<1, 4, VLType::SME, true>( "10:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "ldr x22, [x23, #0x0]\n" + "ldr x20, [x23, #0x0]\n" ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "sdot z17.s, z16.b, z18.b\n" - ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n" "cmp x12, x9\n" "add x23, x23, #0x8\n" "addvl x24, x24, #1\n" @@ -225,7 +225,7 @@ void interleave_block<1, 4, VLType::SME, true>( "addvl x24, x24, #1\n" "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -249,4 +249,4 @@ void interleave_block<1, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp index 618570de08..76c1d053cd 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 4, VLType::SME, false>( uint8_t * &out, const uint8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntb x21\n" @@ -179,11 +177,11 @@ void interleave_block<1, 4, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "cmp x12, x9\n" - ".inst 0xe0162322 // ld1b { za0h.b[x13, #2] }, p0/Z, [x25, x22]\n" + ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n" "add x26, x26, #0x8\n" "addvl x21, x21, #1\n" "add x13, x13, #0x4\n" @@ -200,7 +198,7 @@ void interleave_block<1, 4, VLType::SME, false>( "addvl x21, x21, #1\n" "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x27, %x[width]\n" + "whilelt p8.b, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -220,4 +218,4 @@ void interleave_block<1, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp index 646db0caa8..daf2d3a100 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_block4_u8_u8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 4, VLType::SME, true>( @@ -200,12 +200,12 @@ void interleave_block<1, 4, VLType::SME, true>( "10:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8300 // st1w { za0v.s[x12] }, p0/Z, [x24, XZR, LSL #2]\n" - "ldr x22, [x23, #0x0]\n" + "ldr x20, [x23, #0x0]\n" ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" "add x12, x12, #0x1\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" "udot z17.s, z16.b, z18.b\n" - ".inst 0xe01922c2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x22, x25]\n" + ".inst 0xe0192282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x25]\n" "cmp x12, x9\n" "add x23, x23, #0x8\n" "addvl x24, x24, #1\n" @@ -225,7 +225,7 @@ void interleave_block<1, 4, VLType::SME, true>( "addvl x24, x24, #1\n" "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x28, %x[width]\n" + "whilelt p8.b, x28, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -249,4 +249,4 @@ void interleave_block<1, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp index 788c1a2eca..274f69f370 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp16_fp16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 1, VLType::SME, false>( __fp16 * &out, const __fp16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "mov x21, %x[width]\n" @@ -168,9 +166,9 @@ void interleave_block<1, 1, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25286d20 // psel p0.h, p11.h/Z, p9.h[w12]\n" ".inst 0xe07f82a0 // st1h { za0v.h[x12] }, p0/Z, [x21, XZR, LSL #1]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" ".inst 0x25286140 // psel p0.h, p8.h/Z, p10.h[w12]\n" - ".inst 0xe0560328 // ld1h { za1h.h[x12] }, p0/Z, [x25, x22, LSL #1]\n" + ".inst 0xe0560288 // ld1h { za1h.h[x12] }, p0/Z, [x20, x22, LSL #1]\n" "add x12, x12, #0x1\n" "cmp x12, x11\n" "add x26, x26, #0x8\n" @@ -186,7 +184,7 @@ void interleave_block<1, 1, VLType::SME, false>( "cmp x12, x10\n" "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.h, x27, %x[width]\n" + "whilelt p8.h, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -206,4 +204,4 @@ void interleave_block<1, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp index 7de88543d7..ab290649fd 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave1VL_fp32_fp32.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<1, 1, VLType::SME, false>( float * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "mov x22, %x[width]\n" @@ -167,9 +165,9 @@ void interleave_block<1, 1, VLType::SME, false>( "9:" // K loop: Tails: Even: First ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" - "ldr x25, [x26, #0x0]\n" + "ldr x20, [x26, #0x0]\n" ".inst 0x25306140 // psel p0.s, p8.s/Z, p10.s[w12]\n" - ".inst 0xe0960328 // ld1w { za2h.s[x12] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0960288 // ld1w { za2h.s[x12] }, p0/Z, [x20, x22, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x10\n" "add x26, x26, #0x8\n" @@ -185,7 +183,7 @@ void interleave_block<1, 1, VLType::SME, false>( "cmp x12, x9\n" "addvl x21, x21, #1\n" "blt 10b\n" - "whilelt p9.s, x27, %x[width]\n" + "whilelt p8.s, x27, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -205,4 +203,4 @@ void interleave_block<1, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp index 14ee5d6304..dc6d12b61e 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_bf16_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 1, VLType::SME, false>( bfloat16 * &out, const bfloat16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cnth x28\n" @@ -97,4 +95,4 @@ void interleave_block<2, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp index f648ccf771..d9189258c1 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_bf16_bf16.hpp @@ -22,32 +22,30 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 2, VLType::SME, false>( bfloat16 * &out, const bfloat16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cnth x21\n" - "mov x22, %x[width]\n" - "inch x22\n" + "cnth x22\n" + "mov x21, %x[width]\n" + "inch x21\n" "mov x20, %x[width]\n" - "sub x17, x21, #0x1\n" - "sub x22, x22, #0x1\n" + "sub x17, x22, #0x1\n" + "sub x21, x21, #0x1\n" "ands x17, x20, x17\n" "cntw x16\n" - "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL) - "csel x17, x17, x21, NE\n" - "sub x13, x22, #0x1\n" + "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x22, NE\n" + "sub x13, x21, #0x1\n" "add x17, x17, #0x1\n" "sub x15, x16, #0x2\n" - "lsl x21, %x[height], #0x1\n" // height * 2 + "lsl x22, %x[height], #0x1\n" // height * 2 "lsl x20, x16, #0x1\n" "mov x14, #0x0\n" "mov x11, %x[in]\n" @@ -57,15 +55,15 @@ void interleave_block<2, 2, VLType::SME, false>( "cntw x27, ALL, MUL #3\n" "ldr x26, [x10, #0x0]\n" "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1) "ldr x24, [x11, #0x8]\n" "lsr x17, x17, #0x1\n" "ptrue p13.s\n" - "ldr x23, [x10, #0x8]\n" - "whilelt p12.h, XZR, x21\n" - "whilelt p11.h, x20, x21\n" - "mov x22, %x[row_offset]\n" - "mov x21, %x[out]\n" + "ldr x21, [x10, #0x8]\n" + "whilelt p12.h, XZR, x22\n" + "whilelt p11.h, x20, x22\n" + "mov x23, %x[row_offset]\n" + "mov x22, %x[out]\n" "whilelt p10.h, x14, %x[width]\n" "whilelt p9.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" @@ -76,39 +74,39 @@ void interleave_block<2, 2, VLType::SME, false>( "1:" // K loop: Charge: Loop ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" "ldr x26, [x10, #0x0]\n" - ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" "add x11, x11, #0x10\n" - ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n" "add x12, x12, #0x4\n" "cmp x12, x15, LSL #1\n" - "ldr x23, [x10, #0x8]\n" + "ldr x21, [x10, #0x8]\n" "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" "mov x11, %x[in]\n" "add x10, %x[in], 
x16, LSL #3\n" - ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" - "inch x22\n" + "inch x23\n" "inch x14\n" "ldr x24, [x11, #0x8]\n" "add x11, x11, #0x10\n" - "ldr x23, [x10, #0x8]\n" + "ldr x21, [x10, #0x8]\n" "add x10, x10, #0x10\n" "cbz x13, 8f\n" "mov x20, x13\n" @@ -121,60 +119,60 @@ void interleave_block<2, 2, VLType::SME, false>( "4:" // K loop: Main loop: First: Loop ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x15\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n" "mov x11, %x[in]\n" "add x10, %x[in], x16, LSL #3\n" "ldr x9, [x11, #0x0]\n" ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - 
".inst 0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.h, x14, %x[width]\n" "inch x14\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" - "addvl x21, x21, #4\n" - "inch x22\n" + ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" + "addvl x22, x22, #4\n" + "inch x23\n" "whilelt p9.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" @@ -183,61 +181,61 @@ void interleave_block<2, 2, VLType::SME, false>( "6:" // K loop: Main loop: Second: Loop ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x15\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 
0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n" "mov x11, %x[in]\n" "add x10, %x[in], x16, LSL #3\n" "ldr x9, [x11, #0x0]\n" ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.h, x14, %x[width]\n" "subs x20, x20, #0x1\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" - "addvl x21, x21, #4\n" + ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" + "addvl x22, x22, #4\n" "inch x14\n" - "inch x22\n" + "inch x23\n" "bgt 3b\n" "8:" // K loop: Tails "cbnz x25, 11f\n" @@ -248,51 +246,51 @@ void interleave_block<2, 2, VLType::SME, false>( "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" - "ldr x9, [x11, #0x0]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" + "ldr x21, [x11, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" - "ldr x26, [x11, x16, LSL #0x3]\n" + "ldr x20, [x11, x16, LSL #0x3]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" "cmp x12, x16\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n" + ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n" "add x11, x11, #0x8\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "add x13, x13, #0x2\n" "blt 9b\n" "whilelt p10.h, x14, %x[width]\n" - "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel 
p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p10.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x21\n" + "mov %x[out], x22\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) @@ -300,4 +298,4 @@ void interleave_block<2, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp index 61536d38a5..ef787c89b9 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block2_fp16_fp16.hpp @@ -22,32 +22,30 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 2, VLType::SME, false>( __fp16 * &out, const __fp16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" - "cnth x21\n" - "mov x22, %x[width]\n" - "inch x22\n" + "cnth x22\n" + "mov x21, %x[width]\n" + "inch x21\n" "mov x20, %x[width]\n" - "sub x17, x21, #0x1\n" - "sub x22, x22, #0x1\n" + "sub x17, x22, #0x1\n" + "sub x21, x21, #0x1\n" "ands x17, x20, x17\n" "cntw x16\n" - "udiv x22, x22, x21\n" // n_passes = ceildiv(width, VL) - "csel x17, x17, x21, NE\n" - "sub x13, x22, #0x1\n" + "udiv x21, x21, x22\n" // n_passes = ceildiv(width, VL) + "csel x17, x17, x22, NE\n" + "sub x13, x21, #0x1\n" "add x17, x17, #0x1\n" "sub x15, x16, #0x2\n" - "lsl x21, %x[height], #0x1\n" // height * 2 + "lsl x22, %x[height], #0x1\n" // height * 2 "lsl x20, x16, #0x1\n" "mov x14, #0x0\n" "mov x11, %x[in]\n" @@ -57,15 +55,15 @@ void interleave_block<2, 2, VLType::SME, false>( "cntw x27, ALL, MUL #3\n" "ldr x26, [x10, #0x0]\n" "lsr x13, x13, #0x1\n" // n_loops = (n_passes - 1) / 2 - "and x25, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) + "and x25, x21, #0x1\n" // odd_tail = bool(n_passes & 0x1) "ldr x24, [x11, #0x8]\n" "lsr x17, x17, #0x1\n" "ptrue p13.s\n" - "ldr x23, [x10, #0x8]\n" - "whilelt p12.h, XZR, x21\n" - "whilelt p11.h, x20, x21\n" - "mov x22, %x[row_offset]\n" - "mov x21, %x[out]\n" + "ldr x21, [x10, #0x8]\n" + "whilelt p12.h, XZR, x22\n" + "whilelt p11.h, x20, x22\n" + "mov x23, %x[row_offset]\n" + "mov x22, %x[out]\n" "whilelt p10.h, x14, %x[width]\n" "whilelt p9.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" @@ -76,39 +74,39 @@ void interleave_block<2, 2, VLType::SME, false>( "1:" // K loop: Charge: Loop ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" "ldr x26, [x10, #0x0]\n" - ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" "add x11, x11, #0x10\n" - ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n" "add x12, x12, #0x4\n" "cmp x12, x15, LSL #1\n" - "ldr x23, [x10, #0x8]\n" + "ldr x21, [x10, #0x8]\n" "add x10, x10, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25286581 // psel p1.h, p9.h/Z, p12.h[w12]\n" ".inst 0x25286160 // psel p0.h, p8.h/Z, p11.h[w12]\n" - ".inst 0xe0560520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0560348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0570520 // ld1h { za0h.h[x12] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0570348 // ld1h { za1h.h[x12] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25686581 // psel p1.h, p9.h/Z, p12.h[w12, #2]\n" ".inst 0x25686160 // psel p0.h, p8.h/Z, p11.h[w12, #2]\n" "mov x11, %x[in]\n" "add x10, %x[in], x16, 
LSL #3\n" - ".inst 0xe0560702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0570702 // ld1h { za0h.h[x12, #2] }, p1/Z, [x24, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe05602ea // ld1h { za1h.h[x12, #2] }, p0/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05702aa // ld1h { za1h.h[x12, #2] }, p0/Z, [x21, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" - "inch x22\n" + "inch x23\n" "inch x14\n" "ldr x24, [x11, #0x8]\n" "add x11, x11, #0x10\n" - "ldr x23, [x10, #0x8]\n" + "ldr x21, [x10, #0x8]\n" "add x10, x10, #0x10\n" "cbz x13, 8f\n" "mov x20, x13\n" @@ -121,60 +119,60 @@ void interleave_block<2, 2, VLType::SME, false>( "4:" // K loop: Main loop: First: Loop ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" ".inst 0x25796162 // psel p2.h, p8.h/Z, p11.h[w13, #3]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562aeb // ld1h { za1h.h[x13, #3] }, p2/Z, [x23, x22, LSL #1]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0572aab // ld1h { za1h.h[x13, #3] }, p2/Z, [x21, x23, LSL #1]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x15\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0572349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x23, LSL #1]\n" "mov x11, %x[in]\n" "add x10, %x[in], x16, LSL #3\n" "ldr x9, [x11, #0x0]\n" ".inst 0x25796580 // psel p0.h, p9.h/Z, p12.h[w13, #3]\n" ".inst 0x25796161 // psel p1.h, p8.h/Z, p11.h[w13, #3]\n" - ".inst 0xe0562303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572303 // ld1h { za0h.h[x13, #3] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 
0xe05626eb // ld1h { za1h.h[x13, #3] }, p1/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05726ab // ld1h { za1h.h[x13, #3] }, p1/Z, [x21, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.h, x14, %x[width]\n" "inch x14\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - ".inst 0xe0bb82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" - "addvl x21, x21, #4\n" - "inch x22\n" + ".inst 0xe0bb82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" + "addvl x22, x22, #4\n" + "inch x23\n" "whilelt p9.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" "mov x13, #0x0\n" @@ -183,61 +181,61 @@ void interleave_block<2, 2, VLType::SME, false>( "6:" // K loop: Main loop: Second: Loop ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" + ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n" "ldr x9, [x11, #0x0]\n" - ".inst 0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n" ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" ".inst 0x25696162 // psel p2.h, p8.h/Z, p11.h[w13, #2]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0562aea // ld1h { za1h.h[x13, #2] }, p2/Z, [x23, x22, LSL #1]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0572aaa // ld1h { za1h.h[x13, #2] }, p2/Z, [x21, x23, LSL #1]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x10, x10, #0x10\n" "add x13, x13, #0x4\n" - ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" + ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x15\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25296581 // psel p1.h, p9.h/Z, p12.h[w13]\n" ".inst 0x25296160 // psel p0.h, p8.h/Z, p11.h[w13]\n" - ".inst 0xe0562520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 
0xe0562348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe0572520 // ld1h { za0h.h[x13] }, p1/Z, [x9, x23, LSL #1]\n" + ".inst 0xe0572348 // ld1h { za1h.h[x13] }, p0/Z, [x26, x23, LSL #1]\n" "mov x11, %x[in]\n" "add x10, %x[in], x16, LSL #3\n" "ldr x9, [x11, #0x0]\n" ".inst 0x25696580 // psel p0.h, p9.h/Z, p12.h[w13, #2]\n" ".inst 0x25696161 // psel p1.h, p8.h/Z, p11.h[w13, #2]\n" - ".inst 0xe0562302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x22, LSL #1]\n" + ".inst 0xe0572302 // ld1h { za0h.h[x13, #2] }, p0/Z, [x24, x23, LSL #1]\n" "ldr x26, [x10, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe05626ea // ld1h { za1h.h[x13, #2] }, p1/Z, [x23, x22, LSL #1]\n" + ".inst 0xe05726aa // ld1h { za1h.h[x13, #2] }, p1/Z, [x21, x23, LSL #1]\n" "ldr x24, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x10, #0x8]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x10, #0x8]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.h, x14, %x[width]\n" "subs x20, x20, #0x1\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - ".inst 0xe0bb82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x27, LSL #2]\n" - "addvl x21, x21, #4\n" + ".inst 0xe0bb82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x27, LSL #2]\n" + "addvl x22, x22, #4\n" "inch x14\n" - "inch x22\n" + "inch x23\n" "bgt 3b\n" "8:" // K loop: Tails "cbnz x25, 11f\n" @@ -248,51 +246,51 @@ void interleave_block<2, 2, VLType::SME, false>( "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" - "ldr x9, [x11, #0x0]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" + "ldr x21, [x11, #0x0]\n" "add x12, x12, #0x1\n" ".inst 0x25396581 // psel p1.h, p9.h/Z, p12.h[w13, #1]\n" - "ldr x26, [x11, x16, LSL #0x3]\n" + "ldr x20, [x11, x16, LSL #0x3]\n" ".inst 0x25396160 // psel p0.h, p8.h/Z, p11.h[w13, #1]\n" "cmp x12, x16\n" - ".inst 0xe0562521 // ld1h { za0h.h[x13, #1] }, p1/Z, [x9, x22, LSL #1]\n" - ".inst 0xe0562349 // ld1h { za1h.h[x13, #1] }, p0/Z, [x26, x22, LSL #1]\n" + ".inst 0xe05726a1 // ld1h { za0h.h[x13, #1] }, p1/Z, [x21, x23, LSL #1]\n" + ".inst 0xe0572289 // ld1h { za1h.h[x13, #1] }, p0/Z, [x20, x23, LSL #1]\n" "add x11, x11, #0x8\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "add x13, x13, #0x2\n" "blt 9b\n" "whilelt p10.h, x14, %x[width]\n" - "whilelt p9.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "whilelt p8.h, x14, %x[width]\n" "mov x20, #0x0\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel 
p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "add x20, x20, #0x2\n" "blt 10b\n" - "whilelt p10.h, x14, %x[width]\n" + "whilelt p8.h, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x21\n" + "mov %x[out], x22\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) @@ -300,4 +298,4 @@ void interleave_block<2, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp index 4c701cff19..905c6b41eb 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 4, VLType::SME, false>( int8_t * &out, const int8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntb x21\n" @@ -248,13 +246,13 @@ void interleave_block<2, 4, VLType::SME, false>( ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" - "ldr x9, [x11, #0x0]\n" + "ldr x20, [x11, #0x0]\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" - "ldr x26, [x11, x16, LSL #0x3]\n" + ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n" + "ldr x20, [x11, x16, LSL #0x3]\n" "add x12, x12, #0x1\n" ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n" "cmp x12, x16\n" "add x11, x11, #0x8\n" "addvl x21, x21, #2\n" @@ -274,7 +272,7 @@ void interleave_block<2, 4, VLType::SME, false>( "addvl x21, x21, #2\n" "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -296,4 +294,4 @@ void interleave_block<2, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp 
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp index 25262d3db9..c5c5af20e2 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_s8_s8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 4, VLType::SME, true>( @@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" - ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" - "sdot z19.s, z16.b, z20.b\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + "sdot z19.s, z17.b, z20.b\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "sdot z19.s, z16.b, z20.b\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 5b\n" @@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z16.b, z20.b\n" - ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + "sdot z18.s, z16.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "incb x15\n" "add x26, x26, #0x10\n" - "sdot z19.s, z16.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "incb x28\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ 
-217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" - ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" - "sdot z19.s, z16.b, z20.b\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" + "sdot z19.s, z17.b, z20.b\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "sdot z19.s, z16.b, z20.b\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 7b\n" @@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "sdot z19.s, z16.b, z20.b\n" - ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" + "sdot z18.s, z16.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "subs x20, x20, #0x1\n" "add x26, x26, #0x10\n" - "sdot z19.s, z16.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "incb x15\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "ldr x24, [x26, #0x0]\n" + "ldr x21, [x26, #0x0]\n" ".inst 
0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" - "ldr x23, [x26, x16, LSL #0x3]\n" - ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + "ldr x20, [x26, x16, LSL #0x3]\n" + ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, [x21, x28]\n" "add x12, x12, #0x1\n" ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" "cmp x12, x16\n" - "sdot z19.s, z16.b, z20.b\n" - "sdot z18.s, z17.b, z20.b\n" - ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + "sdot z19.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" + ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n" "add x26, x26, #0x8\n" "addvl x27, x27, #2\n" "add x13, x13, #0x4\n" @@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "sdot z19.s, z16.b, z20.b\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "addvl x27, x27, #2\n" "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "sdot z19.s, z16.b, z20.b\n" - "sdot z18.s, z17.b, z20.b\n" + "sdot z19.s, z17.b, z20.b\n" + "sdot z18.s, z16.b, z20.b\n" "addvl x27, x27, #2\n" "blt 13b\n" "14:" // K loop: End @@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp index 683a315a96..ce9a0065c7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 4, VLType::SME, false>( uint8_t * &out, const uint8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntb x21\n" @@ -248,13 +246,13 @@ void interleave_block<2, 4, VLType::SME, false>( ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" - "ldr x9, [x11, #0x0]\n" + "ldr x20, [x11, #0x0]\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xe0162122 // ld1b { za0h.b[x13, #2] }, p0/Z, [x9, x22]\n" - "ldr x26, [x11, x16, LSL #0x3]\n" + ".inst 0xe0162282 // ld1b { za0h.b[x13, #2] }, p0/Z, [x20, x22]\n" + "ldr x20, [x11, x16, LSL #0x3]\n" "add x12, x12, #0x1\n" ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" - ".inst 0xe0162343 // ld1b { za0h.b[x13, #3] }, p0/Z, [x26, x22]\n" + ".inst 0xe0162283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x22]\n" "cmp x12, x16\n" "add x11, x11, #0x8\n" "addvl x21, x21, #2\n" @@ -274,7 +272,7 @@ void interleave_block<2, 4, VLType::SME, false>( "addvl x21, x21, #2\n" "add x20, x20, #0x4\n" "blt 10b\n" - "whilelt p9.b, x14, %x[width]\n" + "whilelt p8.b, x14, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -296,4 +294,4 @@ void interleave_block<2, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp index e7571f7da7..7805152656 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_block4_u8_u8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 4, VLType::SME, true>( @@ -140,23 +140,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa7 // ld1b { za0h.b[x13, #7] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8760 // st1w { za0v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "udot z19.s, z17.b, z20.b\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" ".inst 0xe0ae8361 // st1w { za0v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 5b\n" @@ -172,28 +172,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a7 // ld1b { za0h.b[x13, #7] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "udot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829031 // mova z17.s, p4/M, za0v.s[x12, #1]\n" + ".inst 0xc0829030 // mova z16.s, p4/M, za0v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f60 // st1w { za0v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08290b0 // mova z16.s, p4/M, za1v.s[x12, #1]\n" + ".inst 0xc08290b1 // mova z17.s, p4/M, za1v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b64 // st1w { za1v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "incb x15\n" "add x26, x26, #0x10\n" - "udot z19.s, z17.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0ae8761 // st1w { za0v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "incb x28\n" ".inst 0xe0ab8365 // st1w { za1v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ -217,23 +217,23 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0xe01c2aa5 // ld1b { za0h.b[x13, #5] }, p2/Z, [x21, x28]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8768 // st1w { za2v.s[x12] }, p1/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" - "udot z19.s, z17.b, 
z20.b\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" ".inst 0xe0ae8369 // st1w { za2v.s[x12, #1] }, p0/Z, [x27, x14, LSL #2]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" - ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" "add x12, x12, #0x2\n" "cmp x12, x9\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #4\n" "add x13, x13, #0x8\n" "blt 7b\n" @@ -249,28 +249,28 @@ void interleave_block<2, 4, VLType::SME, true>( "add x25, %x[in], x16, LSL #3\n" "ldr x24, [x26, #0x0]\n" ".inst 0xe01c22a5 // ld1b { za0h.b[x13, #5] }, p0/Z, [x21, x28]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" ".inst 0x25306d23 // psel p3.s, p11.s/Z, p9.s[w12]\n" - "udot z19.s, z17.b, z20.b\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" + "udot z18.s, z17.b, z20.b\n" "ldr x23, [x25, #0x0]\n" ".inst 0x25306d22 // psel p2.s, p11.s/Z, p9.s[w12]\n" "ldr x22, [x26, #0x8]\n" ".inst 0x25706d21 // psel p1.s, p11.s/Z, p9.s[w12, #1]\n" - ".inst 0xc0829131 // mova z17.s, p4/M, za2v.s[x12, #1]\n" + ".inst 0xc0829130 // mova z16.s, p4/M, za2v.s[x12, #1]\n" ".inst 0x25706d20 // psel p0.s, p11.s/Z, p9.s[w12, #1]\n" "ldr x21, [x25, #0x8]\n" ".inst 0xe0bf8f68 // st1w { za2v.s[x12] }, p3/Z, [x27, XZR, LSL #2]\n" - ".inst 0xc08291b0 // mova z16.s, p4/M, za3v.s[x12, #1]\n" + ".inst 0xc08291b1 // mova z17.s, p4/M, za3v.s[x12, #1]\n" "whilelt p9.b, x15, %x[width]\n" ".inst 0xe0b08b6c // st1w { za3v.s[x12] }, p2/Z, [x27, x16, LSL #2]\n" "subs x20, x20, #0x1\n" "add x26, x26, #0x10\n" - "udot z19.s, z17.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" ".inst 0xe0ae8769 // st1w { za2v.s[x12, #1] }, p1/Z, [x27, x14, LSL #2]\n" "add x25, x25, #0x10\n" - "udot z18.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "incb x15\n" ".inst 0xe0ab836d // st1w { za3v.s[x12, #1] }, p0/Z, [x27, x11, LSL #2]\n" "addvl x27, x27, #4\n" @@ -286,19 +286,19 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - "ldr x24, [x26, #0x0]\n" + "ldr x21, [x26, #0x0]\n" ".inst 0x25356140 // psel p0.b, p8.b/Z, p10.b[w13, #2]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" - "ldr x23, [x26, x16, LSL #0x3]\n" - ".inst 0xe01c2302 // ld1b { za0h.b[x13, #2] }, p0/Z, [x24, x28]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" + "ldr x20, [x26, x16, LSL #0x3]\n" + ".inst 0xe01c22a2 // ld1b { za0h.b[x13, #2] }, p0/Z, 
[x21, x28]\n" "add x12, x12, #0x1\n" ".inst 0x253d6140 // psel p0.b, p8.b/Z, p10.b[w13, #3]\n" "cmp x12, x16\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" - ".inst 0xe01c22e3 // ld1b { za0h.b[x13, #3] }, p0/Z, [x23, x28]\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" + ".inst 0xe01c2283 // ld1b { za0h.b[x13, #3] }, p0/Z, [x20, x28]\n" "add x26, x26, #0x8\n" "addvl x27, x27, #2\n" "add x13, x13, #0x4\n" @@ -311,17 +311,17 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8368 // st1w { za2v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829111 // mova z17.s, p4/M, za2v.s[x12]\n" + ".inst 0xc0829110 // mova z16.s, p4/M, za2v.s[x12]\n" ".inst 0xe0b0836c // st1w { za3v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829190 // mova z16.s, p4/M, za3v.s[x12]\n" + ".inst 0xc0829191 // mova z17.s, p4/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #2\n" "add x20, x20, #0x4\n" "blt 11b\n" - "whilelt p9.b, x15, %x[width]\n" + "whilelt p8.b, x15, %x[width]\n" "b 14f\n" "12:" // K loop: Tails: Odd "mov x12, #0x0\n" @@ -329,13 +329,13 @@ void interleave_block<2, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8360 // st1w { za0v.s[x12] }, p0/Z, [x27, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0829011 // mova z17.s, p4/M, za0v.s[x12]\n" + ".inst 0xc0829010 // mova z16.s, p4/M, za0v.s[x12]\n" ".inst 0xe0b08364 // st1w { za1v.s[x12] }, p0/Z, [x27, x16, LSL #2]\n" - ".inst 0xc0829090 // mova z16.s, p4/M, za1v.s[x12]\n" + ".inst 0xc0829091 // mova z17.s, p4/M, za1v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x17\n" - "udot z19.s, z17.b, z20.b\n" - "udot z18.s, z16.b, z20.b\n" + "udot z19.s, z16.b, z20.b\n" + "udot z18.s, z17.b, z20.b\n" "addvl x27, x27, #2\n" "blt 13b\n" "14:" // K loop: End @@ -350,4 +350,4 @@ void interleave_block<2, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp index 522f310cc0..96ab55ee06 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp16_fp16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 1, VLType::SME, false>( __fp16 * &out, const __fp16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cnth x28\n" @@ -97,4 +95,4 @@ void interleave_block<2, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp index 949e003598..ac4b1b5086 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave2VL_fp32_fp32.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<2, 1, VLType::SME, false>( float * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "mov x22, %x[width]\n" @@ -55,12 +53,12 @@ void interleave_block<2, 1, VLType::SME, false>( "ldr x25, [x11, #0x8]\n" "and x24, x22, #0x1\n" // odd_tail = bool(n_passes & 0x1) "csel x15, x15, x16, NE\n" - "ldr x23, [x9, #0x8]\n" + "ldr x21, [x9, #0x8]\n" "ptrue p13.s\n" "whilelt p12.s, XZR, %x[height]\n" "whilelt p11.s, x16, %x[height]\n" - "mov x22, %x[row_offset]\n" - "mov x21, %x[out]\n" + "mov x23, %x[row_offset]\n" + "mov x22, %x[out]\n" "whilelt p10.s, x13, %x[width]\n" "whilelt p9.s, x13, %x[width]\n" "whilelt p8.s, x13, %x[width]\n" @@ -71,39 +69,39 @@ void interleave_block<2, 1, VLType::SME, false>( "1:" // K loop: Charge: Loop ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" "ldr x10, [x11, #0x0]\n" - ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706160 // psel p0.s, p8.s/Z, p11.s[w12, #1]\n" "ldr x27, [x9, #0x0]\n" - ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n" "ldr x25, [x11, #0x8]\n" "add x11, x11, #0x10\n" - ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" + ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x14\n" - "ldr x23, [x9, #0x8]\n" + "ldr x21, [x9, #0x8]\n" "add x9, x9, #0x10\n" "blt 1b\n" "2:" // K loop: Charge: End ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" - ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" + ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" ".inst 0x25706581 // psel p1.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706160 
// psel p0.s, p8.s/Z, p11.s[w12, #1]\n" "mov x11, %x[in]\n" "add x9, %x[in], x16, LSL #3\n" - ".inst 0xe0960721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970721 // ld1w { za0h.s[x12, #1] }, p1/Z, [x25, x23, LSL #2]\n" "ldr x10, [x11, #0x0]\n" - ".inst 0xe09602e5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x23, x22, LSL #2]\n" + ".inst 0xe09702a5 // ld1w { za1h.s[x12, #1] }, p0/Z, [x21, x23, LSL #2]\n" "ldr x27, [x9, #0x0]\n" - "incw x22\n" + "incw x23\n" "incw x13\n" "ldr x25, [x11, #0x8]\n" "add x11, x11, #0x10\n" - "ldr x23, [x9, #0x8]\n" + "ldr x21, [x9, #0x8]\n" "add x9, x9, #0x10\n" "cbz x20, 8f\n" "mov x20, x20\n" @@ -115,59 +113,59 @@ void interleave_block<2, 1, VLType::SME, false>( "4:" // K loop: Main loop: First: Loop ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" "ldr x10, [x11, #0x0]\n" - ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n" "ldr x27, [x9, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n" "ldr x25, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0960aed // ld1w { za3h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n" - "ldr x23, [x9, #0x8]\n" - ".inst 0xe0bf86a0 // st1w { za0v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0970aad // ld1w { za3h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n" + "ldr x21, [x9, #0x8]\n" + ".inst 0xe0bf86c0 // st1w { za0v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x9, x9, #0x10\n" - ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" + ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x14\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 4b\n" "5:" // K loop: Main loop: First: Tail ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" - ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe0970548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" + ".inst 0xe097036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" "mov x11, %x[in]\n" "add x9, %x[in], x16, LSL #3\n" "ldr x10, [x11, #0x0]\n" ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe0960329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970329 // ld1w { za2h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n" "ldr x27, [x9, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - 
".inst 0xe09606ed // ld1w { za3h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + ".inst 0xe09706ad // ld1w { za3h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n" "ldr x25, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x9, #0x8]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x9, #0x8]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aa4 // st1w { za1v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08ac4 // st1w { za1v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.s, x13, %x[width]\n" "incw x13\n" - ".inst 0xe0bc86a1 // st1w { za0v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c1 // st1w { za0v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x9, x9, #0x10\n" - ".inst 0xe0ba82a5 // st1w { za1v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" - "addvl x21, x21, #4\n" - "incw x22\n" + ".inst 0xe0ba82c5 // st1w { za1v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n" + "addvl x22, x22, #4\n" + "incw x23\n" "whilelt p9.s, x13, %x[width]\n" "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" @@ -175,60 +173,60 @@ void interleave_block<2, 1, VLType::SME, false>( "6:" // K loop: Main loop: Second: Loop ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" "ldr x10, [x11, #0x0]\n" - ".inst 0xe0960364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706162 // psel p2.s, p8.s/Z, p11.s[w12, #1]\n" "ldr x27, [x9, #0x0]\n" ".inst 0x25307541 // psel p1.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n" "ldr x25, [x11, #0x8]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0960ae5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x23, x22, LSL #2]\n" - "ldr x23, [x9, #0x8]\n" - ".inst 0xe0bf86a8 // st1w { za2v.s[x12] }, p1/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0970aa5 // ld1w { za1h.s[x12, #1] }, p2/Z, [x21, x23, LSL #2]\n" + "ldr x21, [x9, #0x8]\n" + ".inst 0xe0bf86c8 // st1w { za2v.s[x12] }, p1/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" "add x11, x11, #0x10\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x9, x9, #0x10\n" - ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" + ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n" "add x12, x12, #0x2\n" "cmp x12, x14\n" - "addvl x21, x21, #4\n" + "addvl x22, x22, #4\n" "blt 6b\n" "7:" // K loop: Main loop: Second: Tail ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - ".inst 0xe0960540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" - ".inst 0xe0960364 // ld1w { 
za1h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + ".inst 0xe0970540 // ld1w { za0h.s[x12] }, p1/Z, [x10, x23, LSL #2]\n" + ".inst 0xe0970364 // ld1w { za1h.s[x12] }, p0/Z, [x27, x23, LSL #2]\n" "mov x11, %x[in]\n" "add x9, %x[in], x16, LSL #3\n" "ldr x10, [x11, #0x0]\n" ".inst 0x25706580 // psel p0.s, p9.s/Z, p12.s[w12, #1]\n" ".inst 0x25706161 // psel p1.s, p8.s/Z, p11.s[w12, #1]\n" - ".inst 0xe0960321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x22, LSL #2]\n" + ".inst 0xe0970321 // ld1w { za0h.s[x12, #1] }, p0/Z, [x25, x23, LSL #2]\n" "ldr x27, [x9, #0x0]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe09606e5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x23, x22, LSL #2]\n" + ".inst 0xe09706a5 // ld1w { za1h.s[x12, #1] }, p1/Z, [x21, x23, LSL #2]\n" "ldr x25, [x11, #0x8]\n" ".inst 0x25307542 // psel p2.s, p13.s/Z, p10.s[w12]\n" - "ldr x23, [x9, #0x8]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + "ldr x21, [x9, #0x8]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25707541 // psel p1.s, p13.s/Z, p10.s[w12, #1]\n" ".inst 0x25707540 // psel p0.s, p13.s/Z, p10.s[w12, #1]\n" - ".inst 0xe0b08aac // st1w { za3v.s[x12] }, p2/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b08acc // st1w { za3v.s[x12] }, p2/Z, [x22, x16, LSL #2]\n" "whilelt p10.s, x13, %x[width]\n" "subs x20, x20, #0x1\n" - ".inst 0xe0bc86a9 // st1w { za2v.s[x12, #1] }, p1/Z, [x21, x28, LSL #2]\n" + ".inst 0xe0bc86c9 // st1w { za2v.s[x12, #1] }, p1/Z, [x22, x28, LSL #2]\n" "add x11, x11, #0x10\n" "add x9, x9, #0x10\n" - ".inst 0xe0ba82ad // st1w { za3v.s[x12, #1] }, p0/Z, [x21, x26, LSL #2]\n" - "addvl x21, x21, #4\n" + ".inst 0xe0ba82cd // st1w { za3v.s[x12, #1] }, p0/Z, [x22, x26, LSL #2]\n" + "addvl x22, x22, #4\n" "incw x13\n" - "incw x22\n" + "incw x23\n" "bgt 3b\n" "8:" // K loop: Tails "cbnz x24, 11f\n" @@ -238,48 +236,48 @@ void interleave_block<2, 1, VLType::SME, false>( "mov x12, #0x0\n" "9:" // K loop: Tails: Even: First ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" - "ldr x10, [x11, #0x0]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" + "ldr x21, [x11, #0x0]\n" ".inst 0x25306581 // psel p1.s, p9.s/Z, p12.s[w12]\n" ".inst 0x25306160 // psel p0.s, p8.s/Z, p11.s[w12]\n" - "ldr x27, [x11, x16, LSL #0x3]\n" - ".inst 0xe0960548 // ld1w { za2h.s[x12] }, p1/Z, [x10, x22, LSL #2]\n" + "ldr x20, [x11, x16, LSL #0x3]\n" + ".inst 0xe09706a8 // ld1w { za2h.s[x12] }, p1/Z, [x21, x23, LSL #2]\n" "add x11, x11, #0x8\n" - "addvl x21, x21, #2\n" - ".inst 0xe096036c // ld1w { za3h.s[x12] }, p0/Z, [x27, x22, LSL #2]\n" + "addvl x22, x22, #2\n" + ".inst 0xe097028c // ld1w { za3h.s[x12] }, p0/Z, [x20, x23, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x16\n" "blt 9b\n" "whilelt p10.s, x13, %x[width]\n" - "whilelt p9.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "whilelt p8.s, x13, %x[width]\n" "mov x12, #0x0\n" "10:" // K loop: Tails: Even: Second ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a8 // st1w { za2v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c8 // st1w { za2v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082ac // st1w { za3v.s[x12] }, p0/Z, [x21, x16, 
LSL #2]\n" + ".inst 0xe0b082cc // st1w { za3v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x15\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "blt 10b\n" - "whilelt p10.s, x13, %x[width]\n" + "whilelt p8.s, x13, %x[width]\n" "b 13f\n" "11:" // K loop: Tails: Odd "mov x12, #0x0\n" "12:" // K loop: Tails: Odd: Loop ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0bf82a0 // st1w { za0v.s[x12] }, p0/Z, [x21, XZR, LSL #2]\n" + ".inst 0xe0bf82c0 // st1w { za0v.s[x12] }, p0/Z, [x22, XZR, LSL #2]\n" ".inst 0x25307540 // psel p0.s, p13.s/Z, p10.s[w12]\n" - ".inst 0xe0b082a4 // st1w { za1v.s[x12] }, p0/Z, [x21, x16, LSL #2]\n" + ".inst 0xe0b082c4 // st1w { za1v.s[x12] }, p0/Z, [x22, x16, LSL #2]\n" "add x12, x12, #0x1\n" "cmp x12, x15\n" - "addvl x21, x21, #2\n" + "addvl x22, x22, #2\n" "blt 12b\n" "13:" // K loop: End - "mov %x[out], x21\n" + "mov %x[out], x22\n" ".inst 0xd503467f // SMSTOP\n" : [out] "+&r" (out) : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset), [width] "r" (width) @@ -287,4 +285,4 @@ void interleave_block<2, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp index 4cc84d344a..2e53475b5c 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block2_bf16_bf16.hpp @@ -22,16 +22,14 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 2, VLType::SME, false>( bfloat16 * &out, const bfloat16 * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x16\n" @@ -124,4 +122,4 @@ void interleave_block<4, 2, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp index 465939c30d..67dd5a9bb7 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 4, VLType::SME, false>( int8_t * &out, const int8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x16\n" @@ -123,4 +121,4 @@ void interleave_block<4, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp index ffd9384a13..21d9378368 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_s8_s8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 4, VLType::SME, true>( @@ -112,22 +112,22 @@ void interleave_block<4, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828812 // mova z18.s, p2/M, za0v.s[x12]\n" + ".inst 0xc0828811 // mova z17.s, p2/M, za0v.s[x12]\n" ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n" ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n" + ".inst 0xc0828893 // mova z19.s, p2/M, za1v.s[x12]\n" ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n" ".inst 0xc0828910 // mova z16.s, p2/M, za2v.s[x12]\n" - "sdot z23.s, z18.b, z24.b\n" + "sdot z23.s, z17.b, z24.b\n" ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n" - ".inst 0xc0828993 // mova z19.s, p2/M, za3v.s[x12]\n" + ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" - "sdot z22.s, z17.b, z24.b\n" + "sdot z22.s, z19.b, z24.b\n" "sdot z21.s, z16.b, z24.b\n" "addvl x9, x9, #4\n" - "sdot z20.s, z19.b, z24.b\n" + "sdot z20.s, z18.b, z24.b\n" "blt 5b\n" "incb x28\n" "whilelt p9.b, x28, %x[width]\n" @@ -147,4 +147,4 @@ void interleave_block<4, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp index 9f5db6ba3d..f149c93293 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 4, VLType::SME, false>( uint8_t * &out, const uint8_t * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x16\n" @@ -123,4 +121,4 @@ void interleave_block<4, 4, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp index 49d2acf1cd..252152e3da 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_block4_u8_u8_summing.hpp @@ -22,7 +22,7 @@ * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 4, VLType::SME, true>( @@ -112,22 +112,22 @@ void interleave_block<4, 4, VLType::SME, true>( ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xe0bf8120 // st1w { za0v.s[x12] }, p0/Z, [x9, XZR, LSL #2]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" - ".inst 0xc0828813 // mova z19.s, p2/M, za0v.s[x12]\n" + ".inst 0xc0828810 // mova z16.s, p2/M, za0v.s[x12]\n" ".inst 0xe0af8124 // st1w { za1v.s[x12] }, p0/Z, [x9, x15, LSL #2]\n" ".inst 0x25306d21 // psel p1.s, p11.s/Z, p9.s[w12]\n" ".inst 0x25306d20 // psel p0.s, p11.s/Z, p9.s[w12]\n" ".inst 0xc0828891 // mova z17.s, p2/M, za1v.s[x12]\n" ".inst 0xe0ae8528 // st1w { za2v.s[x12] }, p1/Z, [x9, x14, LSL #2]\n" - ".inst 0xc0828912 // mova z18.s, p2/M, za2v.s[x12]\n" - "udot z23.s, z19.b, z24.b\n" + ".inst 0xc0828913 // mova z19.s, p2/M, za2v.s[x12]\n" + "udot z23.s, z16.b, z24.b\n" ".inst 0xe0ad812c // st1w { za3v.s[x12] }, p0/Z, [x9, x13, LSL #2]\n" - ".inst 0xc0828990 // mova z16.s, p2/M, za3v.s[x12]\n" + ".inst 0xc0828992 // mova z18.s, p2/M, za3v.s[x12]\n" "add x12, x12, #0x1\n" "cmp x12, x20\n" "udot z22.s, z17.b, z24.b\n" - "udot z21.s, z18.b, z24.b\n" + "udot z21.s, z19.b, z24.b\n" "addvl x9, x9, #4\n" - "udot z20.s, z16.b, z24.b\n" + "udot z20.s, z18.b, z24.b\n" "blt 5b\n" "incb x28\n" "whilelt p9.b, x28, %x[width]\n" @@ -147,4 +147,4 @@ void interleave_block<4, 4, VLType::SME, true>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp index 9579263204..b11bb93c42 100644 --- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/sme_interleave4VL_fp32_fp32.hpp @@ -22,16 +22,14 @@ * SOFTWARE. 
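Note on the *_summing variants above: the sdot/udot instructions maintain the per-row sums that the quantized kernels later need for offset correction. Each slice is copied out of ZA with mova and dotted against a register of ones (z24 here, initialised earlier in the kernel), which folds four adjacent 8-bit values into every 32-bit lane. A scalar sketch of that reduction, illustrative only:

#include <cstdint>
#include <cstddef>

// Scalar model of "sdot zSum.s, zRow.b, zOnes.b": with the second operand
// all ones, each int32 lane accumulates the sum of four adjacent int8
// inputs, so repeating it over the panel yields the row sums used in the
// quantization fixup. The block-of-4 grouping mirrors the block4 layout.
static int32_t row_sum_via_dot(const int8_t *row, size_t len) {
    int32_t sum = 0;
    for (size_t i = 0; i + 4 <= len; i += 4) {
        // one sdot lane: row[i..i+3] . {1,1,1,1}
        sum += row[i] + row[i + 1] + row[i + 2] + row[i + 3];
    }
    return sum;
}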
*/ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SME) template <> void interleave_block<4, 1, VLType::SME, false>( float * &out, const float * const *in, - size_t width, size_t height, size_t row_offset, bool first + size_t width, size_t height, size_t row_offset, bool ) { - ARM_COMPUTE_UNUSED(first); - __asm__ __volatile__( ".inst 0xd503477f // SMSTART ZA\n" "cntw x15\n" @@ -123,4 +121,4 @@ void interleave_block<4, 1, VLType::SME, false>( ); } -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp index 4f25da2877..b921fd16d2 100644 --- a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,8 +39,12 @@ */ template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TOut, typename TIn> void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) { +#ifdef ARM_COMPUTE_ENABLE_SVE const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 )); +#else + const unsigned int int_by = height_vectors; +#endif std::vector<int32_t> the_sums; @@ -104,8 +108,12 @@ void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut> inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) { +#ifdef ARM_COMPUTE_ENABLE_SVE const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 )); +#else + const unsigned int height = height_vectors; +#endif // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not. if (row_sum_multiplier) { @@ -138,8 +146,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) { +#ifdef ARM_COMPUTE_ENABLE_SVE const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 )); +#else + const unsigned int height = height_vectors; +#endif // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for // @@ -208,8 +220,12 @@ void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TOut, typename TIn> void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) { +#ifdef ARM_COMPUTE_ENABLE_SVE const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : (vlt == VLType::SME ? 
sme::get_vector_length<TOut>() / block : 1 )); +#else + const unsigned int height = height_vectors; +#endif auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen); // Use alloca here as a std::vector can be expensive in highly threaded scenarios. @@ -246,8 +262,12 @@ void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const con template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TOut, typename TIn> void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) { +#ifdef ARM_COMPUTE_ENABLE_SVE const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 )); +#else + const unsigned int height = height_vectors; +#endif // Use alloca here as a std::vector can be expensive in highly threaded scenarios. const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *))); diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp index 9a871d4b88..72e414969e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
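Note on the interleave_indirect_impl.hpp hunks above: they all apply the same change. The height calculation that queries the runtime vector length is only meaningful when SVE/SME code can be generated, so it is now compiled under ARM_COMPUTE_ENABLE_SVE, with a plain height_vectors fallback for fixed-length Neon builds. Pulled out on its own, the pattern looks like the sketch below; 'guarded_height' is an invented name, while VLType, get_vector_length and sme::get_vector_length are the library's own, assumed in scope.

// Sketch of the repeated guard: only consult the runtime vector length
// when SVE/SME code can exist in the binary; otherwise height_vectors is
// already the row count itself.
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
unsigned int guarded_height() {
#ifdef ARM_COMPUTE_ENABLE_SVE
    return height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                            (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
#else
    return height_vectors;   // fixed-length Neon path: no vector-length query needed
#endif
}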
*/ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp index 74791f8d30..377daddae9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_bf16fp32_mmla_6x16/generic.cpp @@ -231,11 +231,11 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "17:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 18f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -251,41 +251,41 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x12, #0x10]\n" "blt 21f\n" "20:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + "trn1 v20.2d, v1.2d, v21.2d\n" + ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n" + "ldr q17, [x11, #0x0]\n" + ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n" + "ldr q19, [x11, #0x10]\n" + ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n" + "ldr q18, [x10, #0x0]\n" + ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n" + "ldr q17, [x9, #0x10]\n" + "trn2 v1.2d, v1.2d, v21.2d\n" + ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n" + "ldr q18, [x12, #0x20]\n" + ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n" + "ldr q17, [x12, #0x30]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x11, #0x20]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x11, #0x30]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x9, #0x20]\n" + ".inst 
0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x9, #0x30]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "ldr q1, [x26, #0x0]\n" "add x12, x12, #0x40\n" "ldr q7, [x12, #0x0]\n" @@ -295,39 +295,39 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "add x9, x9, #0x40\n" "bge 20b\n" "21:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + "trn1 v19.2d, v1.2d, v20.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q17, [x11, #0x0]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q18, [x11, #0x10]\n" + ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x0]\n" + ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x10]\n" + ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n" + "ldr q17, [x9, #0x0]\n" + ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n" + "ldr q24, [x9, #0x10]\n" + "trn2 v1.2d, v1.2d, v20.2d\n" + ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n" + "ldr q18, [x12, #0x20]\n" + ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n" + "ldr q17, [x12, #0x30]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q19, [x11, #0x20]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x11, #0x30]\n" + ".inst 0x6e53ec29 // bfmmla v9.4s, v1.8h, v19.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x9, #0x20]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x9, #0x30]\n" "sub x27, x27, #0x8\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "add x26, x26, #0x10\n" "add x12, x12, #0x40\n" "add x11, x11, #0x40\n" @@ -338,26 +338,26 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "cmp x27, #0x4\n" "blt 24f\n" "23:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x12, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr q7, [x12, #0x10]\n" - 
".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" + "ldr d19, [x26], #0x8\n" + "ldr q18, [x12, #0x0]\n" + "trn1 v19.2d, v19.2d, v17.2d\n" + "ldr q17, [x12, #0x10]\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + "ldr q18, [x11, #0x0]\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q17, [x11, #0x10]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x0]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x9, #0x10]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -373,23 +373,23 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr h1, [x26, #0x0]\n" "26:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "ldr q6, [x12, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q20, [x12, #0x0]\n" + "ldr q18, [x12, #0x10]\n" + "trn1 v19.2d, v1.2d, v17.2d\n" + ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n" + "ldr q17, [x11, #0x0]\n" + ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n" + "ldr q18, [x11, #0x10]\n" + ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x0]\n" + ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n" "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e46ee6e // bfmmla v14.4s, v19.8h, v6.8h\n" + "ldr q17, [x9, #0x10]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -405,17 +405,17 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "uzp1 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 28f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" 
- "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "28:" // Height 1: No activation "cmp x14, #0x10\n" "bge 37f\n" @@ -624,12 +624,12 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "55:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 56f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 57f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -637,7 +637,7 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "b 57f\n" "56:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "57:" // Height 2: input setup done "cmp x27, #0x8\n" "blt 60f\n" @@ -648,45 +648,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x12, #0x10]\n" "blt 59f\n" "58:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q18, [x11, #0x0]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q17, [x11, #0x10]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x0]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x9, #0x10]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + "ldr q18, [x12, #0x20]\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" + "ldr q17, [x12, #0x30]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x11, #0x20]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x11, #0x30]\n" + ".inst 
0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x9, #0x20]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x9, #0x30]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" "add x12, x12, #0x40\n" "ldr q7, [x12, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x12, #0x10]\n" "add x11, x11, #0x40\n" @@ -694,39 +694,39 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "add x9, x9, #0x40\n" "bge 58b\n" "59:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q18, [x11, #0x0]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q17, [x11, #0x10]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x0]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x9, #0x10]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + "ldr q18, [x12, #0x20]\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" + "ldr q17, [x12, #0x30]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x11, #0x20]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x11, #0x30]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x9, #0x20]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x9, #0x30]\n" "sub x27, x27, #0x8\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "add x26, x26, 
#0x10\n" "add x25, x25, #0x10\n" "add x12, x12, #0x40\n" @@ -738,27 +738,27 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "cmp x27, #0x4\n" "blt 62f\n" "61:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d18, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "trn1 v19.2d, v18.2d, v17.2d\n" "sub x27, x27, #0x4\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x12, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q6, [x11, #0x0]\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" + "ldr q18, [x12, #0x0]\n" + "ldr q17, [x12, #0x10]\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q26, [x11, #0x0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n" + ".inst 0x6e46ee6d // bfmmla v13.4s, v19.8h, v6.8h\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q18, [x9, #0x0]\n" + "ldr q17, [x9, #0x10]\n" "cmp x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -777,23 +777,23 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr h1, [x26, #0x0]\n" "ldr h2, [x25, #0x0]\n" "64:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "ldr q6, [x12, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q18, [x12, #0x0]\n" + "ldr q17, [x12, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + "ldr q18, [x11, #0x0]\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q17, [x11, #0x10]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q3, [x10, #0x0]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q27, [x10, #0x10]\n" + ".inst 0x6e43ee6a // bfmmla v10.4s, v19.8h, v3.8h\n" + "ldr q18, [x9, #0x0]\n" + ".inst 0x6e5bee6e // bfmmla v14.4s, v19.8h, v27.8h\n" + "ldr q17, [x9, #0x10]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -815,25 +815,25 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "uzp2 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 66f\n" "add x20, %x[args_ptr], %[offset_max]\n" - 
"ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v18.4s\n" + "fmin v12.4s, v12.4s, v18.4s\n" + "fmin v13.4s, v13.4s, v18.4s\n" + "fmin v14.4s, v14.4s, v18.4s\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v7.4s, v7.4s, v17.4s\n" + "fmax v12.4s, v12.4s, v17.4s\n" + "fmax v13.4s, v13.4s, v17.4s\n" + "fmax v14.4s, v14.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "66:" // Height 2: No activation "cmp x14, #0x10\n" "bge 75f\n" @@ -1107,13 +1107,13 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "93:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 94f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 95f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1122,8 +1122,8 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "b 95f\n" "94:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "95:" // Height 3: input setup done "cmp x27, #0x8\n" "blt 98f\n" @@ -1135,170 +1135,170 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x12, #0x10]\n" "blt 97f\n" "96:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" 
+ "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "sub x27, x27, #0x8\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "cmp x27, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x12, #0x20]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x12, #0x30]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x11, #0x20]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" "add x12, x12, #0x40\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x11, #0x30]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x9, #0x20]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" "ldr q7, [x12, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" 
"ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x12, #0x10]\n" "bge 96b\n" "97:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "sub x27, x27, #0x8\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x12, #0x20]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x12, #0x30]\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" "add x12, x12, #0x40\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x11, #0x20]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x11, #0x30]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // 
bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x9, #0x20]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "98:" // Height 3: Multiply loop: Main loop skip "cbz x27, 103f\n" "cmp x27, #0x4\n" "blt 100f\n" "99:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x12, #0x0]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q7, [x12, #0x10]\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr q26, [x12, #0x0]\n" + "trn1 v27.2d, v25.2d, v27.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + "ldr q25, [x12, #0x10]\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "sub x27, x27, #0x4\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "cmp x27, #0x4\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x12, x12, #0x20\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x11, x11, #0x20\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 
0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x10, x10, #0x20\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "bge 99b\n" "100:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 103f\n" @@ -1316,36 +1316,36 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr h2, [x25, #0x0]\n" "ldr h3, [x24, #0x0]\n" "102:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "ldr q6, [x12, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q29, [x12, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v25.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n" + ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n" + "ldr q25, [x11, #0x10]\n" "add x12, x12, #0x20\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "103:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1368,33 +1368,33 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "uzp1 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 104f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { 
v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "104:" // Height 3: No activation "cmp x14, #0x10\n" "bge 113f\n" @@ -1709,14 +1709,14 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "131:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 132f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 133f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1726,9 +1726,9 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "b 133f\n" "132:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "133:" // Height 4: input setup done "cmp x27, #0x8\n" "blt 136f\n" @@ -1741,174 +1741,174 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x12, #0x10]\n" "blt 135f\n" "134:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" "sub x27, x27, #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, 
v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "cmp x27, #0x10\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x12, #0x20]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "add x23, x23, #0x10\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x12, #0x30]\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x11, #0x20]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x11, #0x30]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "add x12, x12, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // 
bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x9, #0x20]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" "ldr q7, [x12, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x12, #0x10]\n" "bge 134b\n" "135:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" "sub x27, x27, #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x23, x23, #0x10\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x12, #0x20]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x12, #0x30]\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" "add x12, x12, #0x40\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, 
v3.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x11, #0x20]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x11, #0x30]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x9, #0x20]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "136:" // Height 4: Multiply loop: Main loop skip "cbz x27, 141f\n" "cmp x27, #0x4\n" "blt 138f\n" "137:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "trn1 v27.2d, v26.2d, v25.2d\n" "cmp x27, #0x4\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x12, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q25, [x12, #0x10]\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x11, #0x0]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x11, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" "add x12, x12, #0x20\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // 
bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x10]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "bge 137b\n" "138:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 141f\n" @@ -1929,36 +1929,36 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr h3, [x24, #0x0]\n" "ldr h4, [x23, #0x0]\n" "140:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "ldr q6, [x12, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q25, [x12, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x11, #0x0]\n" "add x12, x12, #0x20\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x11, #0x10]\n" "add x11, x11, #0x20\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x0]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x10]\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x9, #0x0]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla 
v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "141:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1986,41 +1986,41 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "uzp2 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 142f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v21.4s, v21.4s, v26.4s\n" + "fmin v22.4s, v22.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v15.4s, v15.4s, v25.4s\n" + "fmax v20.4s, v20.4s, v25.4s\n" + "fmax v21.4s, v21.4s, v25.4s\n" + "fmax v22.4s, v22.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "142:" // Height 4: No activation "cmp x14, #0x10\n" "bge 151f\n" @@ -2400,15 +2400,15 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "169:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 170f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, 
[%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 171f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2419,10 +2419,10 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "b 171f\n" "170:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "171:" // Height 5: input setup done "cmp x27, #0x8\n" "blt 174f\n" @@ -2435,170 +2435,170 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr q7, [x12, #0x0]\n" "blt 173f\n" "172:" // Height 5: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" "sub x27, x27, #0x8\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x12, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x12, #0x10]\n" ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n" "cmp x27, #0x10\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n" + "ldr q0, [x11, #0x10]\n" + ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n" "add x26, x26, #0x10\n" ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x0]\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n" "add x22, x22, #0x10\n" ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n" + ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n" ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 
0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" + "ldr q0, [x12, #0x30]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" + "ldr q6, [x11, #0x20]\n" "add x12, x12, #0x40\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n" + "ldr q0, [x11, #0x30]\n" + ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n" + "ldr q0, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n" "ldr q7, [x12, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n" "ldr q5, [x22, #0x0]\n" "bge 172b\n" "173:" // Height 5: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, 
v7.8h\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" "sub x27, x27, #0x8\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x12, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x12, #0x10]\n" ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n" + "ldr q0, [x11, #0x10]\n" + ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n" "add x25, x25, #0x10\n" ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x0]\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n" ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n" + ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n" ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" "ldr q7, [x12, #0x20]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" + ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" + "ldr q2, [x12, #0x30]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" "add x12, x12, #0x40\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q0, [x11, #0x20]\n" + ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n" + "ldr q2, [x11, #0x30]\n" + ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // 
bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0x30]\n" + ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n" + "ldr q0, [x9, #0x20]\n" + ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n" "ldr q6, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n" ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" @@ -2608,51 +2608,51 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "blt 176f\n" "175:" // Height 5: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x12, #0x0]\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q7, [x12, #0x10]\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr d0, [x22], #0x8\n" + "ldr q1, [x12, #0x0]\n" + "trn1 v2.2d, v0.2d, v2.2d\n" + ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n" + "ldr q0, [x12, #0x10]\n" + ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n" + "ldr q1, [x11, #0x0]\n" + ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" "cmp x27, #0x4\n" "add x12, x12, #0x20\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n" + "ldr q0, [x11, #0x10]\n" + ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x0]\n" - 
".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x0]\n" + ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n" "add x10, x10, #0x20\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n" "bge 175b\n" "176:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 179f\n" @@ -2676,45 +2676,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr h4, [x23, #0x0]\n" "ldr h5, [x22, #0x0]\n" "178:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x12, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x12, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v5.2d, v0.2d\n" + "ldr q1, [x12, #0x10]\n" + ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n" + "ldr q0, [x11, #0x0]\n" + ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n" "add x12, x12, #0x20\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" + ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n" + "ldr q1, [x11, #0x10]\n" "add x11, x11, #0x20\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, 
v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" + ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x0]\n" + ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x10]\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n" + "ldr q0, [x9, #0x0]\n" + ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n" "ldr q6, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n" + ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n" "179:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3213,16 +3213,16 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "207:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 208f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 209f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -3234,11 +3234,11 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "b 209f\n" "208:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "209:" // Height 6: input setup done "cmp x27, #0x8\n" "blt 212f\n" @@ -3299,45 +3299,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( ".inst 0x6e46ec57 // 
bfmmla v23.4s, v2.8h, v6.8h\n" "ldr q2, [x25, #0x0]\n" ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" + "ldr q0, [x12, #0x30]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" + "ldr q6, [x11, #0x20]\n" "add x12, x12, #0x40\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n" + "ldr q0, [x11, #0x30]\n" + ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x9, #0x30]\n" + ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n" + "ldr q6, [x9, #0x20]\n" + ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n" + "ldr q0, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n" "ldr q7, [x12, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "bge 210b\n" @@ -3387,38 +3387,38 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x12, #0x30]\n" + "ldr q2, [x12, #0x30]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" "add x12, x12, #0x40\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" 
".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x11, #0x20]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x11, #0x30]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q0, [x11, #0x20]\n" + ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n" + "ldr q2, [x11, #0x30]\n" + ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n" "add x11, x11, #0x40\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0x30]\n" + ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n" "add x10, x10, #0x40\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n" + "ldr q0, [x9, #0x20]\n" + ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n" "ldr q6, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n" ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" @@ -3428,52 +3428,52 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "blt 214f\n" "213:" // Height 6: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "cmp x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x12, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr d1, [x22], #0x8\n" + "ldr d0, [x21], #0x8\n" + "trn1 v2.2d, v1.2d, v0.2d\n" + "ldr q1, [x12, #0x0]\n" + "ldr q0, [x12, #0x10]\n" + ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n" + 
".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n" + "ldr q1, [x11, #0x0]\n" + ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n" "add x12, x12, #0x20\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x11, #0x10]\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n" + "ldr q0, [x11, #0x10]\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x10]\n" + ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x0]\n" + ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x10]\n" "add x10, x10, #0x20\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" + ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n" + "ldr q0, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n" "bge 213b\n" "214:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 217f\n" @@ -3500,45 +3500,45 @@ void a64_ffhybrid_bf16fp32_mmla_6x16 ( "ldr h5, [x22, #0x0]\n" "ldr h6, [x21, #0x0]\n" "216:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x12, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x12, #0x10]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x11, #0x0]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q0, [x12, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n" + "trn1 v2.2d, v5.2d, v6.2d\n" + "ldr q1, 
[x12, #0x10]\n" + ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n" + "ldr q0, [x11, #0x0]\n" + ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n" "add x12, x12, #0x20\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x11, #0x10]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n" + "ldr q1, [x11, #0x10]\n" + ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n" "add x11, x11, #0x20\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x0]\n" + ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x9, #0x0]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n" + "ldr q0, [x9, #0x0]\n" + ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n" "ldr q6, [x9, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n" + ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n" "217:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp index f7506e5123..4924b3a549 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
+ * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp index 18a2db5069..8038612200 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp16_mla_6x32/generic.cpp @@ -265,11 +265,11 @@ void a64_ffhybrid_fp16_mla_6x32 ( "24:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 25f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 26f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -286,69 +286,69 @@ void a64_ffhybrid_fp16_mla_6x32 ( "blt 28f\n" "27:" // Height 1: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla 
v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x11, #0x70]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x12, #0x10]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x9, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x12, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x11, #0x40]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x9, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x12, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x11, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x50]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x9, #0x50]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x12, #0x60]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x11, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x9, #0x60]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x12, #0x70]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x11, #0x70]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr q17, [x10, #0x70]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr q16, [x9, #0x70]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "add x26, x26, #0x10\n" "ldr q0, [x26, #0x0]\n" "add x12, x12, #0x80\n" @@ -360,68 +360,68 @@ void a64_ffhybrid_fp16_mla_6x32 ( "bge 27b\n" "28:" // Height 1: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla 
v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x11, #0x70]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x12, #0x10]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x9, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x12, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x11, #0x40]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x9, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x12, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x11, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x50]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x9, #0x50]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x12, #0x60]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x11, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x9, #0x60]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x12, #0x70]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x11, #0x70]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr q17, [x10, #0x70]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr q16, [x9, #0x70]\n" "sub x27, x27, #0x8\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "add x26, x26, #0x10\n" "add x12, x12, #0x80\n" "add x11, x11, #0x80\n" @@ -431,15 +431,15 @@ void a64_ffhybrid_fp16_mla_6x32 ( "cbz x27, 31f\n" "30:" // Height 1: Multiply loop: Odd block loop "ldr h0, [x26], #0x2\n" - "ldr q6, [x12, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q16, [x12, #0x0]\n" + "fmla v8.8h, v16.8h, v0.h[0]\n" "sub x27, x27, #0x1\n" - "ldr q7, [x11, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q17, [x11, #0x0]\n" + "ldr q16, [x10, #0x0]\n" + "fmla v9.8h, v17.8h, v0.h[0]\n" + "fmla v10.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" @@ -452,17 +452,17 @@ void a64_ffhybrid_fp16_mla_6x32 ( "bne 24b\n" "tbz %x[flags], #1, 32f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { 
v17.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" "32:" // Height 1: No activation "cmp x14, #0x20\n" "bge 49f\n" @@ -778,12 +778,12 @@ void a64_ffhybrid_fp16_mla_6x32 ( "74:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 75f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 76f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -791,7 +791,7 @@ void a64_ffhybrid_fp16_mla_6x32 ( "b 76f\n" "75:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "76:" // Height 2: input setup done "cmp x27, #0x8\n" "blt 79f\n" @@ -804,233 +804,233 @@ void a64_ffhybrid_fp16_mla_6x32 ( "77:" // Height 2: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "sub x27, x27, #0x8\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q16, [x9, #0x0]\n" "cmp x27, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x12, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "fmla v14.8h, v17.8h, v1.h[0]\n" + "ldr q17, [x12, #0x10]\n" "add x26, x26, #0x10\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x11, #0x10]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr q16, [x11, #0x10]\n" "add x25, x25, #0x10\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, 
v1.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x12, #0x70]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "fmla v15.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x9, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x12, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x11, #0x40]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x9, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x12, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x11, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x50]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x9, #0x50]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x12, #0x60]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x11, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x9, #0x60]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x12, #0x70]\n" "add x12, x12, #0x80\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x11, #0x70]\n" "add x11, x11, #0x80\n" - 
"fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x10, #0x70]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr q17, [x10, #0x70]\n" "add x10, x10, #0x80\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr q16, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 77b\n" "78:" // Height 2: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "sub x27, x27, #0x8\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q16, [x9, #0x0]\n" "add x26, x26, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x12, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "fmla v14.8h, v17.8h, v1.h[0]\n" + "ldr q17, [x12, #0x10]\n" "add x25, x25, #0x10\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, 
v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x12, #0x70]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "fmla v15.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x9, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x12, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x11, #0x40]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x9, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x12, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x11, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x50]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x9, #0x50]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x12, #0x60]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x11, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x9, #0x60]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x12, #0x70]\n" "add x12, x12, #0x80\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x10, #0x70]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr q17, [x10, #0x70]\n" "add x10, x10, #0x80\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr q16, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" "79:" // Height 2: Multiply loop: Main loop skip "cbz x27, 81f\n" "80:" // Height 2: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, 
[x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q17, [x12, #0x0]\n" + "ldr q16, [x11, #0x0]\n" + "fmla v8.8h, v17.8h, v1.h[0]\n" + "fmla v12.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x0]\n" + "fmla v9.8h, v16.8h, v1.h[0]\n" + "fmla v13.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.8h, v17.8h, v1.h[0]\n" + "fmla v14.8h, v17.8h, v0.h[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v11.8h, v16.8h, v1.h[0]\n" + "fmla v15.8h, v16.8h, v0.h[0]\n" "add x10, x10, #0x10\n" "add x9, x9, #0x10\n" "cbnz x27, 80b\n" @@ -1043,25 +1043,25 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x25, x13, x20, LSL #1\n" "tbz %x[flags], #1, 82f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v17.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmin v12.8h, v12.8h, v17.8h\n" + "fmin v13.8h, v13.8h, v17.8h\n" + "fmin v14.8h, v14.8h, v17.8h\n" + "fmin v15.8h, v15.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" + "fmax v12.8h, v12.8h, v16.8h\n" + "fmax v13.8h, v13.8h, v16.8h\n" + "fmax v14.8h, v14.8h, v16.8h\n" + "fmax v15.8h, v15.8h, v16.8h\n" "82:" // Height 2: No activation "cmp x14, #0x20\n" "bge 99f\n" @@ -1458,13 +1458,13 @@ void a64_ffhybrid_fp16_mla_6x32 ( "124:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 125f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 126f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1473,8 +1473,8 @@ void a64_ffhybrid_fp16_mla_6x32 ( "b 126f\n" "125:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "126:" // Height 3: input setup done "cmp x27, #0x8\n" "blt 129f\n" @@ -1491,139 +1491,139 @@ void a64_ffhybrid_fp16_mla_6x32 ( "sub x27, x27, #0x8\n" "cmp x27, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - 
"ldr q6, [x10, #0x0]\n" + "ldr q21, [x10, #0x0]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "add x26, x26, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q20, [x9, #0x0]\n" "add x25, x25, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" "add x24, x24, #0x10\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla 
v11.8h, v7.8h, v0.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr q21, [x12, #0x10]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr q20, [x11, #0x10]\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[1]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x10]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x9, #0x10]\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x12, #0x20]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x11, #0x20]\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "fmla v13.8h, v20.8h, v1.h[2]\n" + "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x9, #0x20]\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x12, #0x30]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x11, #0x30]\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0x30]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x9, #0x30]\n" + "fmla v10.8h, v21.8h, v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x12, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x11, #0x40]\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x9, #0x40]\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x12, #0x50]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x11, #0x50]\n" + "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x50]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x9, #0x50]\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x12, #0x60]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x11, #0x60]\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x9, #0x60]\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x12, #0x70]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" "add x12, x12, #0x80\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, 
v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "fmla v16.8h, v21.8h, v2.h[7]\n" + "ldr q21, [x10, #0x70]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" "add x10, x10, #0x80\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr q20, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 127b\n" @@ -1633,162 +1633,162 @@ void a64_ffhybrid_fp16_mla_6x32 ( "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q21, [x10, #0x0]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "add x25, x25, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q20, [x9, #0x0]\n" "add x24, x24, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, 
v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr q21, [x12, #0x10]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr q20, [x11, #0x10]\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[1]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x10]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x9, #0x10]\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x12, #0x20]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x11, #0x20]\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "fmla v13.8h, v20.8h, v1.h[2]\n" + "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x9, #0x20]\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x12, #0x30]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x11, #0x30]\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0x30]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x9, #0x30]\n" + "fmla v10.8h, v21.8h, v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x12, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x11, #0x40]\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x9, #0x40]\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x12, #0x50]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x11, #0x50]\n" 
+ "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x50]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x9, #0x50]\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x12, #0x60]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x11, #0x60]\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x9, #0x60]\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x12, #0x70]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" "add x12, x12, #0x80\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "fmla v16.8h, v21.8h, v2.h[7]\n" + "ldr q21, [x10, #0x70]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" "add x10, x10, #0x80\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr q20, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" "129:" // Height 3: Multiply loop: Main loop skip "cbz x27, 131f\n" "130:" // Height 3: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" + "ldr h2, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr q6, [x12, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr h0, [x24], #0x2\n" + "ldr q21, [x12, #0x0]\n" + "fmla v8.8h, v21.8h, v2.h[0]\n" + "fmla v12.8h, v21.8h, v1.h[0]\n" + "ldr q20, [x11, #0x0]\n" + "fmla v16.8h, v21.8h, v0.h[0]\n" + "ldr q21, [x10, #0x0]\n" + "fmla v9.8h, v20.8h, v2.h[0]\n" + "fmla v13.8h, v20.8h, v1.h[0]\n" + "fmla v17.8h, v20.8h, v0.h[0]\n" + "ldr q20, [x9, #0x0]\n" "add x12, x12, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v2.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v18.8h, v21.8h, v0.h[0]\n" + "fmla v11.8h, v20.8h, v2.h[0]\n" "add x9, x9, #0x10\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla 
v19.8h, v20.8h, v0.h[0]\n" "cbnz x27, 130b\n" "131:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1800,33 +1800,33 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x24, x25, x20, LSL #1\n" "tbz %x[flags], #1, 132f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v21.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" + "ld1r { v20.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v21.8h\n" + "fmin v9.8h, v9.8h, v21.8h\n" + "fmin v10.8h, v10.8h, v21.8h\n" + "fmin v11.8h, v11.8h, v21.8h\n" + "fmin v12.8h, v12.8h, v21.8h\n" + "fmin v13.8h, v13.8h, v21.8h\n" + "fmin v14.8h, v14.8h, v21.8h\n" + "fmin v15.8h, v15.8h, v21.8h\n" + "fmin v16.8h, v16.8h, v21.8h\n" + "fmin v17.8h, v17.8h, v21.8h\n" + "fmin v18.8h, v18.8h, v21.8h\n" + "fmin v19.8h, v19.8h, v21.8h\n" + "fmax v8.8h, v8.8h, v20.8h\n" + "fmax v9.8h, v9.8h, v20.8h\n" + "fmax v10.8h, v10.8h, v20.8h\n" + "fmax v11.8h, v11.8h, v20.8h\n" + "fmax v12.8h, v12.8h, v20.8h\n" + "fmax v13.8h, v13.8h, v20.8h\n" + "fmax v14.8h, v14.8h, v20.8h\n" + "fmax v15.8h, v15.8h, v20.8h\n" + "fmax v16.8h, v16.8h, v20.8h\n" + "fmax v17.8h, v17.8h, v20.8h\n" + "fmax v18.8h, v18.8h, v20.8h\n" + "fmax v19.8h, v19.8h, v20.8h\n" "132:" // Height 3: No activation "cmp x14, #0x20\n" "bge 149f\n" @@ -2304,14 +2304,14 @@ void a64_ffhybrid_fp16_mla_6x32 ( "174:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 175f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 176f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2321,9 +2321,9 @@ void a64_ffhybrid_fp16_mla_6x32 ( "b 176f\n" "175:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "176:" // Height 4: input setup done "cmp x27, #0x8\n" "blt 179f\n" @@ -2342,7 +2342,7 @@ void a64_ffhybrid_fp16_mla_6x32 ( "cmp x27, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q25, [x10, #0x0]\n" "add x26, x26, #0x10\n" "fmla v9.8h, v7.8h, 
v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" @@ -2350,164 +2350,164 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x24, x24, #0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q24, [x9, #0x0]\n" "add x23, x23, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla 
v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x12, #0x70]\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" + "fmla v18.8h, v25.8h, v2.h[0]\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr q25, [x12, #0x10]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr q24, [x11, #0x10]\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x10]\n" + "fmla v9.8h, v24.8h, v0.h[1]\n" + "fmla v13.8h, v24.8h, v1.h[1]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x9, #0x10]\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x12, #0x20]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x11, #0x20]\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x9, #0x20]\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x12, #0x30]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x11, #0x30]\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "fmla v16.8h, v25.8h, v2.h[3]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0x30]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x9, #0x30]\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x12, #0x40]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x11, #0x40]\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x40]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x9, #0x40]\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + 
"fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x12, #0x50]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "fmla v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x11, #0x50]\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x50]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x9, #0x50]\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x12, #0x60]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "fmla v19.8h, v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x11, #0x60]\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x9, #0x60]\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla v14.8h, v25.8h, v1.h[6]\n" + "fmla v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x12, #0x70]\n" "add x12, x12, #0x80\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x10, #0x70]\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla v12.8h, v25.8h, v1.h[7]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr q24, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 177b\n" @@ -2518,7 +2518,7 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x26, x26, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q25, [x10, #0x0]\n" "add x25, x25, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" @@ -2526,191 +2526,191 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x23, x23, 
#0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla 
v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x12, #0x70]\n" + "ldr q24, [x9, #0x0]\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" + "fmla v18.8h, v25.8h, v2.h[0]\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr q25, [x12, #0x10]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr q24, [x11, #0x10]\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x10]\n" + "fmla v9.8h, v24.8h, v0.h[1]\n" + "fmla v13.8h, v24.8h, v1.h[1]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x9, #0x10]\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x12, #0x20]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x11, #0x20]\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x9, #0x20]\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x12, #0x30]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x11, #0x30]\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "fmla v16.8h, v25.8h, v2.h[3]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0x30]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x9, #0x30]\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x12, #0x40]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x11, #0x40]\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x40]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x9, #0x40]\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + "fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x12, #0x50]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "fmla 
v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x11, #0x50]\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x50]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x9, #0x50]\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x12, #0x60]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "fmla v19.8h, v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x11, #0x60]\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x9, #0x60]\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla v14.8h, v25.8h, v1.h[6]\n" + "fmla v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x12, #0x70]\n" "add x12, x12, #0x80\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x10, #0x70]\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla v12.8h, v25.8h, v1.h[7]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr q24, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" "179:" // Height 4: Multiply loop: Main loop skip "cbz x27, 181f\n" "180:" // Height 4: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h3, [x26], #0x2\n" + "ldr h2, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr h1, [x24], #0x2\n" + "ldr h0, [x23], #0x2\n" + "ldr q25, [x12, #0x0]\n" + "ldr q24, [x11, #0x0]\n" + "fmla v8.8h, v25.8h, v3.h[0]\n" + "fmla 
v12.8h, v25.8h, v2.h[0]\n" + "fmla v16.8h, v25.8h, v1.h[0]\n" + "fmla v20.8h, v25.8h, v0.h[0]\n" + "ldr q25, [x10, #0x0]\n" "add x12, x12, #0x10\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v9.8h, v24.8h, v3.h[0]\n" + "fmla v13.8h, v24.8h, v2.h[0]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "fmla v17.8h, v24.8h, v1.h[0]\n" + "fmla v21.8h, v24.8h, v0.h[0]\n" + "ldr q24, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v10.8h, v25.8h, v3.h[0]\n" + "fmla v14.8h, v25.8h, v2.h[0]\n" + "fmla v18.8h, v25.8h, v1.h[0]\n" + "fmla v22.8h, v25.8h, v0.h[0]\n" + "fmla v11.8h, v24.8h, v3.h[0]\n" + "fmla v15.8h, v24.8h, v2.h[0]\n" + "fmla v19.8h, v24.8h, v1.h[0]\n" + "fmla v23.8h, v24.8h, v0.h[0]\n" "cbnz x27, 180b\n" "181:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2723,41 +2723,41 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x23, x24, x20, LSL #1\n" "tbz %x[flags], #1, 182f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v25.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" + "ld1r { v24.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v25.8h\n" + "fmin v9.8h, v9.8h, v25.8h\n" + "fmin v10.8h, v10.8h, v25.8h\n" + "fmin v11.8h, v11.8h, v25.8h\n" + "fmin v12.8h, v12.8h, v25.8h\n" + "fmin v13.8h, v13.8h, v25.8h\n" + "fmin v14.8h, v14.8h, v25.8h\n" + "fmin v15.8h, v15.8h, v25.8h\n" + "fmin v16.8h, v16.8h, v25.8h\n" + "fmin v17.8h, v17.8h, v25.8h\n" + "fmin v18.8h, v18.8h, v25.8h\n" + "fmin v19.8h, v19.8h, v25.8h\n" + "fmin v20.8h, v20.8h, v25.8h\n" + "fmin v21.8h, v21.8h, v25.8h\n" + "fmin v22.8h, v22.8h, v25.8h\n" + "fmin v23.8h, v23.8h, v25.8h\n" + "fmax v8.8h, v8.8h, v24.8h\n" + "fmax v9.8h, v9.8h, v24.8h\n" + "fmax v10.8h, v10.8h, v24.8h\n" + "fmax v11.8h, v11.8h, v24.8h\n" + "fmax v12.8h, v12.8h, v24.8h\n" + "fmax v13.8h, v13.8h, v24.8h\n" + "fmax v14.8h, v14.8h, v24.8h\n" + "fmax v15.8h, v15.8h, v24.8h\n" + "fmax v16.8h, v16.8h, v24.8h\n" + "fmax v17.8h, v17.8h, v24.8h\n" + "fmax v18.8h, v18.8h, v24.8h\n" + "fmax v19.8h, v19.8h, v24.8h\n" + "fmax v20.8h, v20.8h, 
v24.8h\n" + "fmax v21.8h, v21.8h, v24.8h\n" + "fmax v22.8h, v22.8h, v24.8h\n" + "fmax v23.8h, v23.8h, v24.8h\n" "182:" // Height 4: No activation "cmp x14, #0x20\n" "bge 199f\n" @@ -3256,549 +3256,549 @@ void a64_ffhybrid_fp16_mla_6x32 ( "ld1 { v12.h }[2], [x25]\n" "ld1 { v16.h }[2], [x24]\n" "ld1 { v20.h }[2], [x23]\n" - "ld1 { v24.h }[2], [x22]\n" - "b 220f\n" - "219:" // Height 5: Partial accumulate: partial_1_0 - "ldr h8, [x13, #0x0]\n" - "ldr h12, [x25, #0x0]\n" - "mov x20, #0x0\n" - "ldr h16, [x24, #0x0]\n" - "ldr h20, [x23, #0x0]\n" - "ldr h24, [x22, #0x0]\n" - "220:" // Height 5: Partial accumulate: Done - "sub x13, x13, x20\n" - "b 223f\n" - "221:" // Height 5: full accumulate - "ldr q8, [x13, #0x0]\n" - "ldr q9, [x13, #0x10]\n" - "ldr q10, [x13, #0x20]\n" - "ldr q11, [x13, #0x30]\n" - "ldr q12, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "ldr q14, [x25, #0x20]\n" - "ldr q15, [x25, #0x30]\n" - "ldr q16, [x24, #0x0]\n" - "ldr q17, [x24, #0x10]\n" - "ldr q18, [x24, #0x20]\n" - "ldr q19, [x24, #0x30]\n" - "ldr q20, [x23, #0x0]\n" - "ldr q21, [x23, #0x10]\n" - "ldr q22, [x23, #0x20]\n" - "ldr q23, [x23, #0x30]\n" - "ldr q24, [x22, #0x0]\n" - "ldr q25, [x22, #0x10]\n" - "ldr q26, [x22, #0x20]\n" - "ldr q27, [x22, #0x30]\n" - "b 223f\n" - "222:" // Height 5: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "223:" // Height 5: setup done - "mov x28, #0x0\n" - "224:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 225f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 226f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "add x22, x22, x20, LSL #1\n" - "b 226f\n" - "225:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "226:" // Height 5: input setup done - "cmp x27, #0x8\n" - "blt 229f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "blt 228f\n" - "227:" // Height 5: Multiply loop: Main loop head - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x27, x27, #0x8\n" - "cmp x27, #0x10\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x24, x24, #0x10\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "fmla 
v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, 
v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "add x12, x12, #0x80\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x11, #0x70]\n" - "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "add x10, x10, #0x80\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x9, #0x70]\n" - "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x12, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr q2, [x24, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "ldr q3, [x23, #0x0]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "bge 227b\n" - "228:" // Height 5: Multiply loop: Single iteration only - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x23, x23, #0x10\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x22, x22, #0x10\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla 
v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x9, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x12, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x11, #0x40]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x9, #0x40]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x12, #0x50]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x11, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, #0x50]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, 
v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x9, #0x50]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x12, #0x60]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x11, #0x60]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x9, #0x60]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x12, #0x70]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" + "ld1 { v24.h }[2], [x22]\n" + "b 220f\n" + "219:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x25, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "220:" // Height 5: Partial accumulate: Done + "sub x13, x13, x20\n" + "b 223f\n" + "221:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "b 223f\n" + "222:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "223:" // Height 5: setup done + "mov x28, #0x0\n" + "224:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 225f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 226f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "b 226f\n" + "225:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "226:" // Height 5: 
input setup done + "cmp x27, #0x8\n" + "blt 229f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x11, #0x0]\n" + "blt 228f\n" + "227:" // Height 5: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "cmp x27, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q29, [x10, #0x0]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q28, [x9, #0x0]\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr q29, [x12, #0x10]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr q28, [x11, #0x10]\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x10]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "fmla v21.8h, v28.8h, v3.h[1]\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x9, #0x10]\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, v29.8h, v1.h[1]\n" + "fmla v18.8h, v29.8h, v2.h[1]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x12, #0x20]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x11, #0x20]\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" + "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x9, #0x20]\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x12, #0x30]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x11, #0x30]\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0x30]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x9, #0x30]\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x12, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[3]\n" 
+ "fmla v15.8h, v28.8h, v1.h[3]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x11, #0x40]\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x9, #0x40]\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla v26.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x12, #0x50]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x11, #0x50]\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x50]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x9, #0x50]\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x12, #0x60]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla v27.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x11, #0x60]\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x9, #0x60]\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x12, #0x70]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" "add x12, x12, #0x80\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x11, #0x70]\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x11, #0x70]\n" "add x11, x11, #0x80\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x10, #0x70]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr q29, [x10, #0x70]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" "add x10, x10, #0x80\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x9, #0x70]\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, 
v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr q28, [x9, #0x70]\n" "add x9, x9, #0x80\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "229:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 231f\n" - "230:" // Height 5: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" - "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr h4, [x22], #0x2\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" "ldr q6, [x12, #0x0]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q7, [x11, #0x0]\n" + "bge 227b\n" + "228:" // Height 5: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q7, [x11, #0x0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x12, x12, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q29, [x10, #0x0]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x11, x11, #0x10\n" + "add x23, x23, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x10, x10, #0x10\n" + "add x22, x22, #0x10\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q28, [x9, #0x0]\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr q29, [x12, #0x10]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr q28, [x11, #0x10]\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x10]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "fmla v21.8h, v28.8h, v3.h[1]\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x9, #0x10]\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, v29.8h, v1.h[1]\n" + "fmla v18.8h, v29.8h, v2.h[1]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x12, #0x20]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x11, #0x20]\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" 
+ "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x9, #0x20]\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x12, #0x30]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x11, #0x30]\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0x30]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x9, #0x30]\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x12, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[3]\n" + "fmla v15.8h, v28.8h, v1.h[3]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x11, #0x40]\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x9, #0x40]\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla v26.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x12, #0x50]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x11, #0x50]\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x50]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x9, #0x50]\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x12, #0x60]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla v27.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x11, #0x60]\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x9, #0x60]\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x12, #0x70]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" + "add x12, x12, 
#0x80\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x11, #0x70]\n" + "add x11, x11, #0x80\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr q29, [x10, #0x70]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" + "add x10, x10, #0x80\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr q28, [x9, #0x70]\n" + "add x9, x9, #0x80\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "229:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 231f\n" + "230:" // Height 5: Multiply loop: Odd block loop + "ldr h4, [x26], #0x2\n" + "ldr h3, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h1, [x23], #0x2\n" + "ldr h0, [x22], #0x2\n" + "ldr q29, [x12, #0x0]\n" + "fmla v8.8h, v29.8h, v4.h[0]\n" + "fmla v12.8h, v29.8h, v3.h[0]\n" + "ldr q28, [x11, #0x0]\n" + "fmla v16.8h, v29.8h, v2.h[0]\n" + "fmla v20.8h, v29.8h, v1.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.8h, v29.8h, v0.h[0]\n" + "ldr q29, [x10, #0x0]\n" + "fmla v9.8h, v28.8h, v4.h[0]\n" + "add x11, x11, #0x10\n" + "fmla v13.8h, v28.8h, v3.h[0]\n" + "fmla v17.8h, v28.8h, v2.h[0]\n" + "add x10, x10, #0x10\n" + "fmla v21.8h, v28.8h, v1.h[0]\n" + "fmla v25.8h, v28.8h, v0.h[0]\n" + "ldr q28, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v10.8h, v29.8h, v4.h[0]\n" + "fmla v14.8h, v29.8h, v3.h[0]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v1.h[0]\n" + "fmla v26.8h, v29.8h, v0.h[0]\n" + "fmla v11.8h, v28.8h, v4.h[0]\n" + "fmla v15.8h, v28.8h, v3.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v1.h[0]\n" + "fmla v27.8h, v28.8h, v0.h[0]\n" "cbnz x27, 230b\n" "231:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -3812,49 +3812,49 @@ void a64_ffhybrid_fp16_mla_6x32 ( "add x22, x23, x20, LSL #1\n" "tbz %x[flags], #1, 232f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v29.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, 
v1.8h\n" - "fmin v27.8h, v27.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmax v27.8h, v27.8h, v0.8h\n" + "ld1r { v28.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v29.8h\n" + "fmin v9.8h, v9.8h, v29.8h\n" + "fmin v10.8h, v10.8h, v29.8h\n" + "fmin v11.8h, v11.8h, v29.8h\n" + "fmin v12.8h, v12.8h, v29.8h\n" + "fmin v13.8h, v13.8h, v29.8h\n" + "fmin v14.8h, v14.8h, v29.8h\n" + "fmin v15.8h, v15.8h, v29.8h\n" + "fmin v16.8h, v16.8h, v29.8h\n" + "fmin v17.8h, v17.8h, v29.8h\n" + "fmin v18.8h, v18.8h, v29.8h\n" + "fmin v19.8h, v19.8h, v29.8h\n" + "fmin v20.8h, v20.8h, v29.8h\n" + "fmin v21.8h, v21.8h, v29.8h\n" + "fmin v22.8h, v22.8h, v29.8h\n" + "fmin v23.8h, v23.8h, v29.8h\n" + "fmin v24.8h, v24.8h, v29.8h\n" + "fmin v25.8h, v25.8h, v29.8h\n" + "fmin v26.8h, v26.8h, v29.8h\n" + "fmin v27.8h, v27.8h, v29.8h\n" + "fmax v8.8h, v8.8h, v28.8h\n" + "fmax v9.8h, v9.8h, v28.8h\n" + "fmax v10.8h, v10.8h, v28.8h\n" + "fmax v11.8h, v11.8h, v28.8h\n" + "fmax v12.8h, v12.8h, v28.8h\n" + "fmax v13.8h, v13.8h, v28.8h\n" + "fmax v14.8h, v14.8h, v28.8h\n" + "fmax v15.8h, v15.8h, v28.8h\n" + "fmax v16.8h, v16.8h, v28.8h\n" + "fmax v17.8h, v17.8h, v28.8h\n" + "fmax v18.8h, v18.8h, v28.8h\n" + "fmax v19.8h, v19.8h, v28.8h\n" + "fmax v20.8h, v20.8h, v28.8h\n" + "fmax v21.8h, v21.8h, v28.8h\n" + "fmax v22.8h, v22.8h, v28.8h\n" + "fmax v23.8h, v23.8h, v28.8h\n" + "fmax v24.8h, v24.8h, v28.8h\n" + "fmax v25.8h, v25.8h, v28.8h\n" + "fmax v26.8h, v26.8h, v28.8h\n" + "fmax v27.8h, v27.8h, v28.8h\n" "232:" // Height 5: No activation "cmp x14, #0x20\n" "bge 249f\n" @@ -4497,16 +4497,16 @@ void a64_ffhybrid_fp16_mla_6x32 ( "274:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 275f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 276f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -4518,11 +4518,11 @@ void a64_ffhybrid_fp16_mla_6x32 ( "b 276f\n" "275:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "276:" // Height 6: input setup done 
"cmp x27, #0x8\n" "blt 279f\n" @@ -5017,45 +5017,45 @@ void a64_ffhybrid_fp16_mla_6x32 ( "279:" // Height 6: Multiply loop: Main loop skip "cbz x27, 281f\n" "280:" // Height 6: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h7, [x26], #0x2\n" + "ldr h6, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr h4, [x22], #0x2\n" - "ldr h5, [x21], #0x2\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr h5, [x24], #0x2\n" + "ldr h4, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr q1, [x12, #0x0]\n" + "ldr q0, [x11, #0x0]\n" + "fmla v8.8h, v1.8h, v7.h[0]\n" + "fmla v12.8h, v1.8h, v6.h[0]\n" + "fmla v16.8h, v1.8h, v5.h[0]\n" + "fmla v20.8h, v1.8h, v4.h[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "fmla v28.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x10, #0x0]\n" + "fmla v24.8h, v1.8h, v3.h[0]\n" + "fmla v28.8h, v1.8h, v2.h[0]\n" + "ldr q1, [x10, #0x0]\n" "add x10, x10, #0x10\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x9, #0x0]\n" + "fmla v9.8h, v0.8h, v7.h[0]\n" + "fmla v13.8h, v0.8h, v6.h[0]\n" + "fmla v17.8h, v0.8h, v5.h[0]\n" + "fmla v21.8h, v0.8h, v4.h[0]\n" + "fmla v25.8h, v0.8h, v3.h[0]\n" + "fmla v29.8h, v0.8h, v2.h[0]\n" + "ldr q0, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v30.8h, v6.8h, v5.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "fmla v31.8h, v7.8h, v5.h[0]\n" + "fmla v10.8h, v1.8h, v7.h[0]\n" + "fmla v14.8h, v1.8h, v6.h[0]\n" + "fmla v18.8h, v1.8h, v5.h[0]\n" + "fmla v22.8h, v1.8h, v4.h[0]\n" + "fmla v26.8h, v1.8h, v3.h[0]\n" + "fmla v30.8h, v1.8h, v2.h[0]\n" + "fmla v11.8h, v0.8h, v7.h[0]\n" + "fmla v15.8h, v0.8h, v6.h[0]\n" + "fmla v19.8h, v0.8h, v5.h[0]\n" + "fmla v23.8h, v0.8h, v4.h[0]\n" + "fmla v27.8h, v0.8h, v3.h[0]\n" + "fmla v31.8h, v0.8h, v2.h[0]\n" "cbnz x27, 280b\n" "281:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp index 08f5aeb2d8..94fb84e409 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp index e0fbe17bad..b1cd6dc970 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32_mla_6x16/generic.cpp @@ -209,11 +209,11 @@ void a64_ffhybrid_fp32_mla_6x16 ( "16:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -230,37 +230,37 @@ void a64_ffhybrid_fp32_mla_6x16 ( "blt 20f\n" "19:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr q17, [x12, #0x10]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x9, #0x30]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x8\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v11.4s, v7.4s, 
v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "add x26, x26, #0x10\n" "ldr q0, [x26, #0x0]\n" "add x12, x12, #0x40\n" @@ -272,36 +272,36 @@ void a64_ffhybrid_fp32_mla_6x16 ( "bge 19b\n" "20:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x11, #0x30]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr q17, [x12, #0x10]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x12, #0x30]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x11, #0x30]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x10, #0x30]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x9, #0x30]\n" "sub x27, x27, #0x4\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "add x26, x26, #0x10\n" "add x12, x12, #0x40\n" "add x11, x11, #0x40\n" @@ -310,16 +310,16 @@ void a64_ffhybrid_fp32_mla_6x16 ( "21:" // Height 1: Multiply loop: Main loop skip "cbz x27, 23f\n" "22:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x12, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v8.4s, v16.4s, v18.s[0]\n" "sub x27, x27, #0x1\n" - "ldr q7, [x11, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q17, [x11, #0x0]\n" + "ldr q16, [x10, #0x0]\n" + "fmla v9.4s, v17.4s, v18.s[0]\n" + "fmla v10.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v11.4s, v16.4s, v18.s[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" @@ -332,17 +332,17 @@ void a64_ffhybrid_fp32_mla_6x16 ( "bne 16b\n" "tbz %x[flags], #1, 24f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, 
v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" "24:" // Height 1: No activation "cmp x14, #0x10\n" "bge 33f\n" @@ -538,12 +538,12 @@ void a64_ffhybrid_fp32_mla_6x16 ( "50:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 51f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 52f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -551,7 +551,7 @@ void a64_ffhybrid_fp32_mla_6x16 ( "b 52f\n" "51:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "52:" // Height 2: input setup done "cmp x27, #0x4\n" "blt 55f\n" @@ -564,137 +564,137 @@ void a64_ffhybrid_fp32_mla_6x16 ( "53:" // Height 2: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "sub x27, x27, #0x4\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q16, [x9, #0x0]\n" "cmp x27, #0x8\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x12, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr q17, [x12, #0x10]\n" "add x26, x26, #0x10\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x11, #0x10]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr q16, [x11, #0x10]\n" "add x25, x25, #0x10\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x12, #0x30]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x12, #0x30]\n" "add x12, x12, #0x40\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "fmla v15.4s, v16.4s, 
v1.s[2]\n" + "ldr q16, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x10, #0x30]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr q17, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v1.s[3]\n" + "ldr q16, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 53b\n" "54:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q17, [x10, #0x0]\n" "sub x27, x27, #0x4\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q16, [x9, #0x0]\n" "add x26, x26, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x12, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr q17, [x12, #0x10]\n" "add x25, x25, #0x10\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x12, #0x30]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr q16, [x11, #0x10]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x10]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x9, #0x10]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x12, #0x20]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x11, #0x20]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x9, #0x20]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x12, #0x30]\n" "add x12, x12, #0x40\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "fmla v15.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x10, #0x30]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr q17, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "fmla v13.4s, 
v16.4s, v1.s[3]\n" + "ldr q16, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" "55:" // Height 2: Multiply loop: Main loop skip "cbz x27, 57f\n" "56:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr q17, [x12, #0x0]\n" + "ldr q16, [x11, #0x0]\n" + "fmla v8.4s, v17.4s, v19.s[0]\n" + "fmla v12.4s, v17.4s, v18.s[0]\n" + "ldr q17, [x10, #0x0]\n" + "fmla v9.4s, v16.4s, v19.s[0]\n" + "fmla v13.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x9, #0x0]\n" + "fmla v10.4s, v17.4s, v19.s[0]\n" + "fmla v14.4s, v17.4s, v18.s[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v11.4s, v16.4s, v19.s[0]\n" + "fmla v15.4s, v16.4s, v18.s[0]\n" "add x10, x10, #0x10\n" "add x9, x9, #0x10\n" "cbnz x27, 56b\n" @@ -707,25 +707,25 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x25, x13, x20, LSL #2\n" "tbz %x[flags], #1, 58f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmin v12.4s, v12.4s, v17.4s\n" + "fmin v13.4s, v13.4s, v17.4s\n" + "fmin v14.4s, v14.4s, v17.4s\n" + "fmin v15.4s, v15.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" + "fmax v14.4s, v14.4s, v16.4s\n" + "fmax v15.4s, v15.4s, v16.4s\n" "58:" // Height 2: No activation "cmp x14, #0x10\n" "bge 67f\n" @@ -970,13 +970,13 @@ void a64_ffhybrid_fp32_mla_6x16 ( "84:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 85f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 86f\n" "ldr 
x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -985,8 +985,8 @@ void a64_ffhybrid_fp32_mla_6x16 ( "b 86f\n" "85:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "86:" // Height 3: input setup done "cmp x27, #0x4\n" "blt 89f\n" @@ -1003,75 +1003,75 @@ void a64_ffhybrid_fp32_mla_6x16 ( "sub x27, x27, #0x4\n" "cmp x27, #0x8\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q21, [x10, #0x0]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "add x26, x26, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q20, [x9, #0x0]\n" "add x25, x25, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" "add x24, x24, #0x10\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr q21, [x12, #0x10]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr q20, [x11, #0x10]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x10]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x9, #0x10]\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x12, #0x20]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x11, #0x20]\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" + "fmla v13.4s, v20.4s, v1.s[2]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x9, #0x20]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x12, #0x30]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" "add x12, x12, #0x40\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + "fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, 
v2.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr q21, [x10, #0x30]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" "add x10, x10, #0x40\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr q20, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 87b\n" @@ -1081,98 +1081,98 @@ void a64_ffhybrid_fp32_mla_6x16 ( "sub x27, x27, #0x4\n" "add x26, x26, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q21, [x10, #0x0]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "add x25, x25, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q20, [x9, #0x0]\n" "add x24, x24, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr q21, [x12, #0x10]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr q20, [x11, #0x10]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x10]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x9, #0x10]\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x12, #0x20]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x11, #0x20]\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" 
+ "fmla v13.4s, v20.4s, v1.s[2]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x9, #0x20]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x12, #0x30]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" "add x12, x12, #0x40\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + "fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr q21, [x10, #0x30]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" "add x10, x10, #0x40\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr q20, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" "89:" // Height 3: Multiply loop: Main loop skip "cbz x27, 91f\n" "90:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x12, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x12, #0x0]\n" + "fmla v8.4s, v21.4s, v24.s[0]\n" + "fmla v12.4s, v21.4s, v23.s[0]\n" + "ldr q20, [x11, #0x0]\n" + "fmla v16.4s, v21.4s, v22.s[0]\n" + "ldr q21, [x10, #0x0]\n" + "fmla v9.4s, v20.4s, v24.s[0]\n" + "fmla v13.4s, v20.4s, v23.s[0]\n" + "fmla v17.4s, v20.4s, v22.s[0]\n" + "ldr q20, [x9, #0x0]\n" "add x12, x12, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v21.4s, v24.s[0]\n" + "fmla v14.4s, v21.4s, v23.s[0]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v18.4s, v21.4s, v22.s[0]\n" + "fmla v11.4s, v20.4s, v24.s[0]\n" "add x9, x9, #0x10\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v15.4s, v20.4s, v23.s[0]\n" + "fmla v19.4s, v20.4s, v22.s[0]\n" "cbnz x27, 90b\n" "91:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1184,33 +1184,33 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 92f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v21.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - 
"fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v21.4s\n" + "fmin v9.4s, v9.4s, v21.4s\n" + "fmin v10.4s, v10.4s, v21.4s\n" + "fmin v11.4s, v11.4s, v21.4s\n" + "fmin v12.4s, v12.4s, v21.4s\n" + "fmin v13.4s, v13.4s, v21.4s\n" + "fmin v14.4s, v14.4s, v21.4s\n" + "fmin v15.4s, v15.4s, v21.4s\n" + "fmin v16.4s, v16.4s, v21.4s\n" + "fmin v17.4s, v17.4s, v21.4s\n" + "fmin v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "92:" // Height 3: No activation "cmp x14, #0x10\n" "bge 101f\n" @@ -1504,14 +1504,14 @@ void a64_ffhybrid_fp32_mla_6x16 ( "118:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 119f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 120f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1521,9 +1521,9 @@ void a64_ffhybrid_fp32_mla_6x16 ( "b 120f\n" "119:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "120:" // Height 4: input setup done "cmp x27, #0x4\n" "blt 123f\n" @@ -1542,7 +1542,7 @@ void a64_ffhybrid_fp32_mla_6x16 ( "cmp x27, #0x8\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q25, [x10, #0x0]\n" "add x26, x26, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -1550,84 +1550,84 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x24, x24, #0x10\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "ldr q24, [x9, #0x0]\n" "add x23, x23, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla 
v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x12, #0x30]\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr q25, [x12, #0x10]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr q24, [x11, #0x10]\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x10]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x9, #0x10]\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "fmla v22.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x12, #0x20]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x11, #0x20]\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x9, #0x20]\n" + "fmla v10.4s, v25.4s, v0.s[2]\n" + "fmla v14.4s, v25.4s, v1.s[2]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x12, #0x30]\n" "add x12, x12, #0x40\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x10, #0x30]\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr q25, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "fmla v17.4s, v24.4s, 
v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr q24, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 121b\n" @@ -1638,7 +1638,7 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x26, x26, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q25, [x10, #0x0]\n" "add x25, x25, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -1646,111 +1646,111 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x23, x23, #0x10\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x12, #0x30]\n" + "ldr q24, [x9, #0x0]\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr q25, [x12, #0x10]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr q24, [x11, #0x10]\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x10]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x9, #0x10]\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "fmla v22.4s, v25.4s, 
v3.s[1]\n" + "ldr q25, [x12, #0x20]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x11, #0x20]\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x9, #0x20]\n" + "fmla v10.4s, v25.4s, v0.s[2]\n" + "fmla v14.4s, v25.4s, v1.s[2]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x12, #0x30]\n" "add x12, x12, #0x40\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x10, #0x30]\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr q25, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "fmla v17.4s, v24.4s, v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr q24, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" "123:" // Height 4: Multiply loop: Main loop skip "cbz x27, 125f\n" "124:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x12, #0x0]\n" + "ldr q24, [x11, #0x0]\n" + "fmla v8.4s, v25.4s, v29.s[0]\n" + "fmla v12.4s, v25.4s, v28.s[0]\n" + "fmla v16.4s, v25.4s, v27.s[0]\n" + "fmla v20.4s, v25.4s, v26.s[0]\n" + "ldr q25, [x10, #0x0]\n" "add x12, x12, #0x10\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v9.4s, v24.4s, v29.s[0]\n" + "fmla v13.4s, v24.4s, v28.s[0]\n" "add x11, x11, #0x10\n" "add x10, x10, #0x10\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "fmla v17.4s, v24.4s, v27.s[0]\n" + "fmla v21.4s, v24.4s, v26.s[0]\n" + "ldr q24, [x9, #0x0]\n" "add x9, x9, #0x10\n" 
- "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v10.4s, v25.4s, v29.s[0]\n" + "fmla v14.4s, v25.4s, v28.s[0]\n" + "fmla v18.4s, v25.4s, v27.s[0]\n" + "fmla v22.4s, v25.4s, v26.s[0]\n" + "fmla v11.4s, v24.4s, v29.s[0]\n" + "fmla v15.4s, v24.4s, v28.s[0]\n" + "fmla v19.4s, v24.4s, v27.s[0]\n" + "fmla v23.4s, v24.4s, v26.s[0]\n" "cbnz x27, 124b\n" "125:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1763,41 +1763,41 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 126f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v25.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" + "ld1r { v24.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v25.4s\n" + "fmin v9.4s, v9.4s, v25.4s\n" + "fmin v10.4s, v10.4s, v25.4s\n" + "fmin v11.4s, v11.4s, v25.4s\n" + "fmin v12.4s, v12.4s, v25.4s\n" + "fmin v13.4s, v13.4s, v25.4s\n" + "fmin v14.4s, v14.4s, v25.4s\n" + "fmin v15.4s, v15.4s, v25.4s\n" + "fmin v16.4s, v16.4s, v25.4s\n" + "fmin v17.4s, v17.4s, v25.4s\n" + "fmin v18.4s, v18.4s, v25.4s\n" + "fmin v19.4s, v19.4s, v25.4s\n" + "fmin v20.4s, v20.4s, v25.4s\n" + "fmin v21.4s, v21.4s, v25.4s\n" + "fmin v22.4s, v22.4s, v25.4s\n" + "fmin v23.4s, v23.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v24.4s\n" + "fmax v9.4s, v9.4s, v24.4s\n" + "fmax v10.4s, v10.4s, v24.4s\n" + "fmax v11.4s, v11.4s, v24.4s\n" + "fmax v12.4s, v12.4s, v24.4s\n" + "fmax v13.4s, v13.4s, v24.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "fmax v23.4s, v23.4s, v24.4s\n" "126:" // Height 4: No activation "cmp x14, #0x10\n" "bge 135f\n" @@ -2137,155 +2137,155 @@ void a64_ffhybrid_fp32_mla_6x16 ( "movi v27.16b, #0x0\n" "151:" // Height 5: setup done "mov x28, #0x0\n" - "152:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 153f\n" - "ldr x21, 
[%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 154f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x22, x22, x20, LSL #2\n" - "b 154f\n" - "153:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "154:" // Height 5: input setup done - "cmp x27, #0x4\n" - "blt 157f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "blt 156f\n" - "155:" // Height 5: Multiply loop: Main loop head - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "cmp x27, #0x8\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x24, x24, #0x10\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" + "152:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + 
"tbz %x[flags], #3, 153f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 154f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #2\n" + "add x25, x25, x20, LSL #2\n" + "add x24, x24, x20, LSL #2\n" + "add x23, x23, x20, LSL #2\n" + "add x22, x22, x20, LSL #2\n" + "b 154f\n" + "153:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "154:" // Height 5: input setup done + "cmp x27, #0x4\n" + "blt 157f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x8\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x12, #0x0]\n" + "ldr q7, [x11, #0x0]\n" + "blt 156f\n" + "155:" // Height 5: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x27, x27, #0x4\n" + "cmp x27, #0x8\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q29, [x10, #0x0]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q28, [x9, #0x0]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr q29, [x12, #0x10]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr q28, [x11, #0x10]\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x10]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x9, #0x10]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x12, #0x20]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x11, #0x20]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + "fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x9, #0x20]\n" + "fmla v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x12, #0x30]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" "add x12, x12, #0x40\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" 
- "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr q29, [x10, #0x30]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" "add x10, x10, #0x40\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr q28, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" "ldr q6, [x12, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" "ldr q3, [x23, #0x0]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x11, #0x0]\n" "bge 155b\n" @@ -2299,7 +2299,7 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "ldr q29, [x10, #0x0]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "add x23, x23, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -2307,130 +2307,130 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x22, x22, #0x10\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x9, #0x0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x12, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x11, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x9, #0x10]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x12, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, 
v3.s[1]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x11, #0x20]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x9, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x12, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q28, [x9, #0x0]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr q29, [x12, #0x10]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr q28, [x11, #0x10]\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x10]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x9, #0x10]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x12, #0x20]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x11, #0x20]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + "fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x9, #0x20]\n" + "fmla v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x12, #0x30]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" "add x12, x12, #0x40\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x11, #0x30]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x11, #0x30]\n" "add x11, x11, #0x40\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x10, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr q29, [x10, #0x30]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" "add x10, x10, #0x40\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla 
v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x9, #0x30]\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr q28, [x9, #0x30]\n" "add x9, x9, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" "157:" // Height 5: Multiply loop: Main loop skip "cbz x27, 159f\n" "158:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x12, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x12, #0x0]\n" + "fmla v8.4s, v29.4s, v2.s[0]\n" + "fmla v12.4s, v29.4s, v1.s[0]\n" + "ldr q28, [x11, #0x0]\n" + "fmla v16.4s, v29.4s, v0.s[0]\n" + "fmla v20.4s, v29.4s, v31.s[0]\n" "add x12, x12, #0x10\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v24.4s, v29.4s, v30.s[0]\n" + "ldr q29, [x10, #0x0]\n" + "fmla v9.4s, v28.4s, v2.s[0]\n" "add x11, x11, #0x10\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v17.4s, v28.4s, v0.s[0]\n" "add x10, x10, #0x10\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "fmla v21.4s, v28.4s, v31.s[0]\n" + "fmla v25.4s, v28.4s, v30.s[0]\n" + "ldr q28, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v10.4s, v29.4s, v2.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v18.4s, v29.4s, v0.s[0]\n" + "fmla v22.4s, v29.4s, v31.s[0]\n" + "fmla v26.4s, v29.4s, v30.s[0]\n" + "fmla v11.4s, v28.4s, v2.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v0.s[0]\n" + "fmla v23.4s, v28.4s, v31.s[0]\n" + "fmla v27.4s, v28.4s, v30.s[0]\n" "cbnz x27, 158b\n" "159:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2444,49 +2444,49 @@ void a64_ffhybrid_fp32_mla_6x16 ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 160f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v29.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, 
v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmin v26.4s, v26.4s, v1.4s\n" - "fmin v27.4s, v27.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" + "ld1r { v28.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v29.4s\n" + "fmin v9.4s, v9.4s, v29.4s\n" + "fmin v10.4s, v10.4s, v29.4s\n" + "fmin v11.4s, v11.4s, v29.4s\n" + "fmin v12.4s, v12.4s, v29.4s\n" + "fmin v13.4s, v13.4s, v29.4s\n" + "fmin v14.4s, v14.4s, v29.4s\n" + "fmin v15.4s, v15.4s, v29.4s\n" + "fmin v16.4s, v16.4s, v29.4s\n" + "fmin v17.4s, v17.4s, v29.4s\n" + "fmin v18.4s, v18.4s, v29.4s\n" + "fmin v19.4s, v19.4s, v29.4s\n" + "fmin v20.4s, v20.4s, v29.4s\n" + "fmin v21.4s, v21.4s, v29.4s\n" + "fmin v22.4s, v22.4s, v29.4s\n" + "fmin v23.4s, v23.4s, v29.4s\n" + "fmin v24.4s, v24.4s, v29.4s\n" + "fmin v25.4s, v25.4s, v29.4s\n" + "fmin v26.4s, v26.4s, v29.4s\n" + "fmin v27.4s, v27.4s, v29.4s\n" + "fmax v8.4s, v8.4s, v28.4s\n" + "fmax v9.4s, v9.4s, v28.4s\n" + "fmax v10.4s, v10.4s, v28.4s\n" + "fmax v11.4s, v11.4s, v28.4s\n" + "fmax v12.4s, v12.4s, v28.4s\n" + "fmax v13.4s, v13.4s, v28.4s\n" + "fmax v14.4s, v14.4s, v28.4s\n" + "fmax v15.4s, v15.4s, v28.4s\n" + "fmax v16.4s, v16.4s, v28.4s\n" + "fmax v17.4s, v17.4s, v28.4s\n" + "fmax v18.4s, v18.4s, v28.4s\n" + "fmax v19.4s, v19.4s, v28.4s\n" + "fmax v20.4s, v20.4s, v28.4s\n" + "fmax v21.4s, v21.4s, v28.4s\n" + "fmax v22.4s, v22.4s, v28.4s\n" + "fmax v23.4s, v23.4s, v28.4s\n" + "fmax v24.4s, v24.4s, v28.4s\n" + "fmax v25.4s, v25.4s, v28.4s\n" + "fmax v26.4s, v26.4s, v28.4s\n" + "fmax v27.4s, v27.4s, v28.4s\n" "160:" // Height 5: No activation "cmp x14, #0x10\n" "bge 169f\n" @@ -2881,16 +2881,16 @@ void a64_ffhybrid_fp32_mla_6x16 ( "186:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 187f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 188f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -2902,11 +2902,11 @@ void 
a64_ffhybrid_fp32_mla_6x16 ( "b 188f\n" "187:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "188:" // Height 6: input setup done "cmp x27, #0x4\n" "blt 191f\n" @@ -3177,45 +3177,45 @@ void a64_ffhybrid_fp32_mla_6x16 ( "191:" // Height 6: Multiply loop: Main loop skip "cbz x27, 193f\n" "192:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x12, #0x0]\n" - "ldr q7, [x11, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x12, #0x0]\n" + "ldr q0, [x11, #0x0]\n" + "fmla v8.4s, v1.4s, v7.s[0]\n" + "fmla v12.4s, v1.4s, v6.s[0]\n" + "fmla v16.4s, v1.4s, v5.s[0]\n" + "fmla v20.4s, v1.4s, v4.s[0]\n" "add x12, x12, #0x10\n" "add x11, x11, #0x10\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "fmla v28.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x10, #0x0]\n" + "fmla v24.4s, v1.4s, v3.s[0]\n" + "fmla v28.4s, v1.4s, v2.s[0]\n" + "ldr q1, [x10, #0x0]\n" "add x10, x10, #0x10\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x9, #0x0]\n" + "fmla v9.4s, v0.4s, v7.s[0]\n" + "fmla v13.4s, v0.4s, v6.s[0]\n" + "fmla v17.4s, v0.4s, v5.s[0]\n" + "fmla v21.4s, v0.4s, v4.s[0]\n" + "fmla v25.4s, v0.4s, v3.s[0]\n" + "fmla v29.4s, v0.4s, v2.s[0]\n" + "ldr q0, [x9, #0x0]\n" "add x9, x9, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v30.4s, v6.4s, v5.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "fmla v31.4s, v7.4s, v5.s[0]\n" + "fmla v10.4s, v1.4s, v7.s[0]\n" + "fmla v14.4s, v1.4s, v6.s[0]\n" + "fmla v18.4s, v1.4s, v5.s[0]\n" + "fmla v22.4s, v1.4s, v4.s[0]\n" + "fmla v26.4s, v1.4s, v3.s[0]\n" + "fmla v30.4s, v1.4s, v2.s[0]\n" + "fmla v11.4s, v0.4s, v7.s[0]\n" + "fmla v15.4s, v0.4s, v6.s[0]\n" + "fmla v19.4s, v0.4s, v5.s[0]\n" + "fmla v23.4s, v0.4s, v4.s[0]\n" + "fmla v27.4s, v0.4s, v3.s[0]\n" + "fmla v31.4s, v0.4s, v2.s[0]\n" "cbnz x27, 192b\n" "193:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp index af2c1e5ae0..923d008bb1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp index 1f707fa962..8961e615d7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffhybrid_fp32bf16fp32_mmla_4x24/generic.cpp @@ -283,11 +283,11 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "21:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 22f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -307,32 +307,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "24:" // Height 1: Multiply loop: Main loop head ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q24, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + "ldr q23, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + "ldr q22, [x9, #0x0]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q21, [x9, #0x10]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + "ldr q24, [x28, #0x0]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x10]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x27, #0x0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x27, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x8\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e58ec0c 
// bfmmla v12.4s, v0.8h, v24.8h\n" "add x12, x12, #0x20\n" "ldr q4, [x12, #0x0]\n" "add x11, x11, #0x20\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "ldr q5, [x12, #0x10]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "ld1 { v0.4s }, [x24], #0x10\n" "ldr q7, [x11, #0x10]\n" "add x10, x10, #0x20\n" @@ -343,28 +343,28 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "25:" // Height 1: Multiply loop: Single iteration only ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q22, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + "ldr q25, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + "ldr q21, [x9, #0x0]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q24, [x9, #0x10]\n" + ".inst 0x6e56ec0a // bfmmla v10.4s, v0.8h, v22.8h\n" + "ldr q23, [x28, #0x0]\n" + ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n" + "ldr q22, [x28, #0x10]\n" + ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n" + "ldr q21, [x27, #0x0]\n" + ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n" + "ldr q3, [x27, #0x10]\n" "sub x25, x25, #0x4\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n" + ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n" "add x10, x10, #0x20\n" "add x9, x9, #0x20\n" "add x28, x28, #0x20\n" @@ -380,31 +380,31 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "27:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr s0, [x24, #0x0]\n" "28:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q4, [x12, #0x0]\n" - "ldr q5, [x12, #0x10]\n" + "ldr q21, [x12, #0x0]\n" + "ldr q30, [x12, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q6, [x11, #0x0]\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q4, [x10, #0x0]\n" - "ldr q5, [x10, #0x10]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - "ldr q6, [x27, #0x0]\n" - "ldr q7, [x27, #0x10]\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 
0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n" + "ldr q21, [x11, #0x0]\n" + "ldr q22, [x11, #0x10]\n" + ".inst 0x6e5eec0e // bfmmla v14.4s, v0.8h, v30.8h\n" + ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n" + "ldr q21, [x10, #0x0]\n" + "ldr q23, [x10, #0x10]\n" + ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n" + "ldr q21, [x9, #0x0]\n" + "ldr q22, [x9, #0x10]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0x0]\n" + "ldr q23, [x28, #0x10]\n" + ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n" + "ldr q22, [x27, #0x0]\n" + "ldr q21, [x27, #0x10]\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -424,21 +424,21 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "uzp1 v13.2d, v13.2d, v19.2d\n" "tbz %x[flags], #1, 30f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v22.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v21.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v22.4s\n" + "fmin v9.4s, v9.4s, v22.4s\n" + "fmin v10.4s, v10.4s, v22.4s\n" + "fmin v11.4s, v11.4s, v22.4s\n" + "fmin v12.4s, v12.4s, v22.4s\n" + "fmin v13.4s, v13.4s, v22.4s\n" + "fmax v8.4s, v8.4s, v21.4s\n" + "fmax v9.4s, v9.4s, v21.4s\n" + "fmax v10.4s, v10.4s, v21.4s\n" + "fmax v11.4s, v11.4s, v21.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "fmax v13.4s, v13.4s, v21.4s\n" "30:" // Height 1: No activation "cmp x14, #0x18\n" "bge 43f\n" @@ -744,12 +744,12 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "65:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 66f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 67f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -757,7 +757,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "b 67f\n" "66:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "67:" // Height 2: input setup done "cmp x25, #0x4\n" "blt 70f\n" @@ -774,32 +774,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" "ld1 { v1.4s }, [x23], #0x10\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q30, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x10, 
#0x10]\n" + "ldr q23, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + "ldr q22, [x9, #0x0]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q21, [x9, #0x10]\n" + ".inst 0x6e5eec0a // bfmmla v10.4s, v0.8h, v30.8h\n" + "ldr q2, [x28, #0x0]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x10]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x27, #0x0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x27, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x8\n" "add x12, x12, #0x20\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e42ec0c // bfmmla v12.4s, v0.8h, v2.8h\n" "ldr q4, [x12, #0x0]\n" "add x11, x11, #0x20\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "ldr q5, [x12, #0x10]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "ld1 { v0.4s }, [x24], #0x10\n" "add x10, x10, #0x20\n" "ldr q7, [x11, #0x10]\n" @@ -811,28 +811,28 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q24, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + "ldr q23, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + "ldr q22, [x9, #0x0]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q21, [x9, #0x10]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + "ldr q24, [x28, #0x0]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x10]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x27, #0x0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x27, #0x10]\n" "sub x25, x25, #0x4\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x10, x10, #0x20\n" "add x9, x9, #0x20\n" "add x28, x28, #0x20\n" @@ -851,32 +851,32 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "ldr s0, [x24, #0x0]\n" "ldr s1, [x23, #0x0]\n" "72:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr 
q4, [x12, #0x0]\n" - "ldr q5, [x12, #0x10]\n" + "ldr q24, [x12, #0x0]\n" + "ldr q23, [x12, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - "ldr q6, [x11, #0x0]\n" - "ldr q7, [x11, #0x10]\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q4, [x10, #0x0]\n" - "ldr q5, [x10, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q6, [x27, #0x0]\n" - "ldr q7, [x27, #0x10]\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ldr q22, [x11, #0x0]\n" + "ldr q21, [x11, #0x10]\n" + ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n" + "ldr q24, [x10, #0x0]\n" + "ldr q23, [x10, #0x10]\n" + ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n" + "ldr q22, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q24, [x28, #0x0]\n" + "ldr q23, [x28, #0x10]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q22, [x27, #0x0]\n" + "ldr q21, [x27, #0x10]\n" + ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x12, x12, #0x20\n" "add x11, x11, #0x20\n" "add x10, x10, #0x20\n" @@ -904,33 +904,33 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "uzp2 v13.2d, v13.2d, v19.2d\n" "tbz %x[flags], #1, 74f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v22.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v4.4s, v4.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v21.4s }, [x20]\n" + "fmin v4.4s, v4.4s, v22.4s\n" + "fmin v14.4s, v14.4s, v22.4s\n" + "fmin v15.4s, v15.4s, v22.4s\n" + "fmin v16.4s, v16.4s, v22.4s\n" + "fmin v17.4s, v17.4s, v22.4s\n" + "fmin v18.4s, v18.4s, v22.4s\n" + "fmin v8.4s, v8.4s, v22.4s\n" + "fmin v9.4s, v9.4s, v22.4s\n" + "fmin v10.4s, v10.4s, v22.4s\n" + "fmin v11.4s, v11.4s, v22.4s\n" + "fmin v12.4s, 
v12.4s, v22.4s\n" + "fmin v13.4s, v13.4s, v22.4s\n" + "fmax v4.4s, v4.4s, v21.4s\n" + "fmax v14.4s, v14.4s, v21.4s\n" + "fmax v15.4s, v15.4s, v21.4s\n" + "fmax v16.4s, v16.4s, v21.4s\n" + "fmax v17.4s, v17.4s, v21.4s\n" + "fmax v18.4s, v18.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v21.4s\n" + "fmax v9.4s, v9.4s, v21.4s\n" + "fmax v10.4s, v10.4s, v21.4s\n" + "fmax v11.4s, v11.4s, v21.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "fmax v13.4s, v13.4s, v21.4s\n" "74:" // Height 2: No activation "cmp x14, #0x18\n" "bge 87f\n" @@ -1339,13 +1339,13 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "109:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 110f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 111f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1354,8 +1354,8 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "b 111f\n" "110:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "111:" // Height 3: input setup done "cmp x25, #0x4\n" "blt 114f\n" @@ -1386,7 +1386,7 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" "cmp x25, #0x8\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" + "ldr q3, [x9, #0x10]\n" ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" "add x12, x12, #0x20\n" ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" @@ -1399,10 +1399,10 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "add x10, x10, #0x20\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n" + "ldr q3, [x27, #0x10]\n" "add x28, x28, #0x20\n" ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" "add x27, x27, #0x20\n" @@ -1414,9 +1414,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" "ldr q6, [x11, #0x0]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n" "ld1 { v0.4s }, [x24], #0x10\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n" "ld1 { v2.4s }, [x22], #0x10\n" "ldr q7, [x11, #0x10]\n" "bge 112b\n" @@ -1427,10 +1427,10 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "sub x25, x25, #0x4\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q3, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" "add x12, x12, #0x20\n" ".inst 0x6e46ec55 // bfmmla 
v21.4s, v2.8h, v6.8h\n" @@ -1438,31 +1438,31 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" "add x11, x11, #0x20\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q1, [x9, #0x10]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" "add x10, x10, #0x20\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" + ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n" + "ldr q5, [x28, #0x0]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x10]\n" ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" "add x28, x28, #0x20\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q3, [x27, #0x0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x27, #0x10]\n" "add x27, x27, #0x20\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "114:" // Height 3: Multiply loop: Main loop skip "cbz x25, 117f\n" "cbz x25, 117f\n" @@ -1480,51 +1480,51 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "ldr s1, [x23, #0x0]\n" "ldr s2, [x22, #0x0]\n" "116:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q4, [x12, #0x0]\n" - "ldr q5, [x12, #0x10]\n" + "ldr q5, [x12, #0x0]\n" + "ldr q4, [x12, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - "ldr q6, [x11, #0x0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q3, [x11, #0x0]\n" + "ldr q1, [x11, #0x10]\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" + "ldr q5, [x10, #0x0]\n" + ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n" "add x12, x12, #0x20\n" - ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n" + "ldr q4, [x10, #0x10]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, 
[x9, #0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n" + "ldr q3, [x9, #0x0]\n" + ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n" + "ldr q1, [x9, #0x10]\n" + ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x0]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x10]\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" "add x28, x28, #0x20\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + "ldr q3, [x27, #0x0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x27, #0x10]\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" "add x27, x27, #0x20\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "117:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -2070,14 +2070,14 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "153:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 154f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 155f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -2087,9 +2087,9 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "b 155f\n" "154:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, 
x22, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "155:" // Height 4: input setup done "cmp x25, #0x4\n" "blt 158f\n" @@ -2167,40 +2167,40 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" "add x11, x11, #0x20\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x10, #0x0]\n" + "ldr q3, [x10, #0x0]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" "add x10, x10, #0x20\n" ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" "ldr q6, [x9, #0x0]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q1, [x9, #0x10]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" "add x9, x9, #0x20\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" + ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n" + "ldr q5, [x28, #0x0]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x10]\n" ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" "add x28, x28, #0x20\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + "ldr q3, [x27, #0x0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x27, #0x10]\n" "add x27, x27, #0x20\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "158:" // Height 4: Multiply loop: Main loop skip "cbz x25, 161f\n" "cbz x25, 161f\n" @@ -2221,52 +2221,52 @@ void a64_ffhybrid_fp32bf16fp32_mmla_4x24 ( "ldr s2, [x22, #0x0]\n" "ldr s3, [x21, #0x0]\n" "160:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q4, [x12, #0x0]\n" - "ldr q5, [x12, #0x10]\n" + "ldr q5, [x12, #0x0]\n" + "ldr q4, [x12, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" - "ldr q6, [x11, #0x0]\n" - "ldr q7, [x11, #0x10]\n" + "ldr q7, [x11, #0x0]\n" + "ldr q6, [x11, #0x10]\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - 
"ldr q4, [x10, #0x0]\n" + ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" + "ldr q5, [x10, #0x0]\n" "add x12, x12, #0x20\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x10, #0x10]\n" + ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n" + "ldr q4, [x10, #0x10]\n" "add x11, x11, #0x20\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x9, #0x0]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q3, [x9, #0x0]\n" "add x10, x10, #0x20\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x9, #0x10]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + "ldr q1, [x9, #0x10]\n" "add x9, x9, #0x20\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x10]\n" + ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x0]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x10]\n" "add x28, x28, #0x20\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x27, #0x0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x27, #0x10]\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + "ldr q3, [x27, #0x0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x27, #0x10]\n" "add x27, x27, #0x20\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "161:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp index e24dab68e8..745f89eff6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12.hpp @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp index 2458d6a035..5f4fcac690 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_dot_8x12/generic.cpp @@ -52,29 +52,29 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( __asm__ __volatile__( "1:" // Height loop - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x24, [%x[args_ptr], %[offsetof_N]]\n" - "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x23, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x24, %x[Apanel]\n" "2:" // Width loop - "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" - "add x22, x25, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "cmp x24, #0x8\n" - "mov %x[Apanel], x23\n" + "cmp x25, #0x8\n" + "mov %x[Apanel], x24\n" "bgt 3f\n" - "cmp x24, #0x4\n" - "mov x21, x25\n" + "cmp x25, #0x4\n" + "mov x21, x23\n" "bgt 3f\n" - "mov x22, x25\n" + "mov x22, x23\n" "3:" // B setup done "ldr q0, [%x[Apanel], #0x0]\n" "ldr q1, [%x[Apanel], #0x10]\n" "movi v8.16b, #0x0\n" - "ldr q4, [x25, #0x0]\n" + "ldr q4, [x23, #0x0]\n" "ldr q5, [x22, #0x0]\n" "movi v9.16b, #0x0\n" "ldr q6, [x21, #0x0]\n" @@ -104,8 +104,8 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( "movi v31.16b, #0x0\n" "blt 5f\n" "4:" // main loop head - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" + "ldr q3, [%x[Apanel], #0x20]\n" + "ldr q7, [%x[Apanel], #0x30]\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n" ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n" @@ -117,11 +117,11 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n" "add %x[Apanel], %x[Apanel], #0x40\n" ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n" - "ldr q4, 
[x25, #0x10]\n" + "ldr q4, [x23, #0x10]\n" ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n" ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n" ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n" - "add x25, x25, #0x20\n" + "add x23, x23, #0x20\n" ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n" ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n" ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n" @@ -138,35 +138,35 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n" ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x21, #0x10]\n" + "ldr q2, [x21, #0x10]\n" "ldr q1, [%x[Apanel], #0x10]\n" "add x21, x21, #0x20\n" - ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n" - ".inst 0x4f62f08b // bfdot v11.4s, v4.8h, v2.h[1]\n" - ".inst 0x4f42f88e // bfdot v14.4s, v4.8h, v2.h[2]\n" - ".inst 0x4f62f891 // bfdot v17.4s, v4.8h, v2.h[3]\n" - ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" - ".inst 0x4f63f097 // bfdot v23.4s, v4.8h, v3.h[1]\n" - ".inst 0x4f43f89a // bfdot v26.4s, v4.8h, v3.h[2]\n" - ".inst 0x4f63f89d // bfdot v29.4s, v4.8h, v3.h[3]\n" - "ldr q4, [x25, #0x0]\n" - ".inst 0x4f42f0a9 // bfdot v9.4s, v5.8h, v2.h[0]\n" - ".inst 0x4f62f0ac // bfdot v12.4s, v5.8h, v2.h[1]\n" - ".inst 0x4f42f8af // bfdot v15.4s, v5.8h, v2.h[2]\n" - ".inst 0x4f62f8b2 // bfdot v18.4s, v5.8h, v2.h[3]\n" - ".inst 0x4f43f0b5 // bfdot v21.4s, v5.8h, v3.h[0]\n" - ".inst 0x4f63f0b8 // bfdot v24.4s, v5.8h, v3.h[1]\n" - ".inst 0x4f43f8bb // bfdot v27.4s, v5.8h, v3.h[2]\n" - ".inst 0x4f63f8be // bfdot v30.4s, v5.8h, v3.h[3]\n" + ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n" + ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n" + ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n" + ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n" + ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n" + ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n" + ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n" + ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n" + "ldr q4, [x23, #0x0]\n" + ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n" + ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n" + ".inst 0x4f43f8af // bfdot v15.4s, v5.8h, v3.h[2]\n" + ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n" + ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n" + ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n" + ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n" + ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n" "ldr q5, [x22, #0x0]\n" - ".inst 0x4f42f0ca // bfdot v10.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f62f0cd // bfdot v13.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f63f0d9 // bfdot v25.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f43f8dc // bfdot v28.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n" + ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n" + ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n" + ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n" + ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n" + ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n" + ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n" + ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n" "ldr q6, [x21, #0x0]\n" "bge 4b\n" "5:" // main loop skip @@ -175,7 +175,7 @@ void 
a64_ffinterleaved_bf16fp32_dot_8x12( "add %x[Apanel], %x[Apanel], #0x20\n" ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n" ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n" - "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n" "add x22, x22, #0x10\n" @@ -199,38 +199,38 @@ void a64_ffinterleaved_bf16fp32_dot_8x12( ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" "cbz x20, 6f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [%x[Apanel], #0x0]\n" + "ldr q3, [%x[Apanel], #0x10]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x25, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n" - "ldr q5, [x21, #0x0]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f40f8ee // bfdot v14.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f60f8f1 // bfdot v17.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f41f0f4 // bfdot v20.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f61f0f7 // bfdot v23.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f41f8fa // bfdot v26.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f61f8fd // bfdot v29.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f40f089 // bfdot v9.4s, v4.8h, v0.h[0]\n" - ".inst 0x4f60f08c // bfdot v12.4s, v4.8h, v0.h[1]\n" - ".inst 0x4f40f88f // bfdot v15.4s, v4.8h, v0.h[2]\n" - ".inst 0x4f60f892 // bfdot v18.4s, v4.8h, v0.h[3]\n" - ".inst 0x4f41f095 // bfdot v21.4s, v4.8h, v1.h[0]\n" - ".inst 0x4f61f098 // bfdot v24.4s, v4.8h, v1.h[1]\n" - ".inst 0x4f41f89b // bfdot v27.4s, v4.8h, v1.h[2]\n" - ".inst 0x4f61f89e // bfdot v30.4s, v4.8h, v1.h[3]\n" - ".inst 0x4f40f0aa // bfdot v10.4s, v5.8h, v0.h[0]\n" - ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n" - ".inst 0x4f40f8b0 // bfdot v16.4s, v5.8h, v0.h[2]\n" - ".inst 0x4f60f8b3 // bfdot v19.4s, v5.8h, v0.h[3]\n" - ".inst 0x4f41f0b6 // bfdot v22.4s, v5.8h, v1.h[0]\n" - ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n" - ".inst 0x4f41f8bc // bfdot v28.4s, v5.8h, v1.h[2]\n" - ".inst 0x4f61f8bf // bfdot v31.4s, v5.8h, v1.h[3]\n" + "ldr q2, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n" + "ldr q0, [x21, #0x0]\n" + ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n" + ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n" + ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n" + ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n" + ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n" + ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n" + ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, v3.h[3]\n" + ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n" + ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n" + ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n" + ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n" + ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n" + ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n" + ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n" + ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n" + ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n" + ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n" + ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n" + ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n" + ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n" + ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n" + ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n" + ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n" "6:" // multiply loop done - "subs x24, 
x24, #0xc\n" + "subs x25, x25, #0xc\n" "str q8, [%x[Cpanel], #0x0]\n" "str q9, [%x[Cpanel], #0x10]\n" "str q10, [%x[Cpanel], #0x20]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp index c61315b80a..cf4d74266a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp index 47991114af..4a1c1b5638 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_bf16fp32_mmla_8x12/generic.cpp @@ -52,37 +52,37 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( __asm__ __volatile__( "1:" // Height loop - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x24, [%x[args_ptr], %[offsetof_N]]\n" - "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x23, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x24, %x[Apanel]\n" "2:" // Width loop - "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" - "add x22, x25, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "cmp x24, #0x8\n" - "mov %x[Apanel], x23\n" + "cmp x25, #0x8\n" + "mov %x[Apanel], x24\n" "bgt 3f\n" - "cmp x24, #0x4\n" - "mov x21, x25\n" + "cmp x25, #0x4\n" + "mov x21, x23\n" "bgt 3f\n" - "mov x22, x25\n" + "mov x22, x23\n" "3:" // B setup done - "ldr q4, [x25, #0x0]\n" + "ldr q4, [x23, #0x0]\n" "ldr q0, [%x[Apanel], #0x0]\n" "movi v8.16b, #0x0\n" "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q5, [x25, #0x10]\n" + "ldr q5, [x23, #0x10]\n" "movi v9.16b, #0x0\n" "ldr q2, [%x[Apanel], #0x20]\n" "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" "movi 
v10.16b, #0x0\n" "movi v11.16b, #0x0\n" - "add x25, x25, #0x20\n" + "add x23, x23, #0x20\n" "movi v12.16b, #0x0\n" "movi v13.16b, #0x0\n" "add %x[Apanel], %x[Apanel], #0x30\n" @@ -106,31 +106,31 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( "movi v31.16b, #0x0\n" "blt 5f\n" "4:" // main loop head - "ldr q3, [%x[Apanel], #0x0]\n" - "ldr q6, [x22, #0x0]\n" + "ldr q6, [%x[Apanel], #0x0]\n" + "ldr q7, [x22, #0x0]\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q7, [x22, #0x10]\n" + "ldr q3, [x22, #0x10]\n" ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" "sub x20, x20, #0x2\n" ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n" "ldr q4, [x21, #0x0]\n" - ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" + ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n" "ldr q5, [x21, #0x10]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n" + ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n" "cmp x20, #0x2\n" - ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" - "ldr q6, [x25, #0x0]\n" - ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" - "ldr q7, [x25, #0x10]\n" + ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n" + ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n" + "ldr q7, [x23, #0x0]\n" + ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n" + "ldr q3, [x23, #0x10]\n" ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" "ldr q0, [%x[Apanel], #0x10]\n" @@ -140,22 +140,22 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" "ldr q2, [%x[Apanel], #0x30]\n" - ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n" "ldr q4, [x22, #0x20]\n" - ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" - "ldr q3, [%x[Apanel], #0x40]\n" + ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n" + "ldr q6, [%x[Apanel], #0x40]\n" "ldr q5, [x22, #0x30]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n" "add x22, x22, #0x40\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - "ldr q6, [x21, #0x20]\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldr q7, [x21, #0x30]\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, 
v3.8h\n" + ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n" + "ldr q7, [x21, #0x20]\n" + ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n" + "ldr q3, [x21, #0x30]\n" ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" @@ -163,23 +163,23 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( "add x21, x21, #0x40\n" ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" - "ldr q4, [x25, #0x20]\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" - "ldr q5, [x25, #0x30]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n" + "ldr q4, [x23, #0x20]\n" + ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n" + "ldr q5, [x23, #0x30]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" "ldr q0, [%x[Apanel], #0x50]\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n" "ldr q1, [%x[Apanel], #0x60]\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" "ldr q2, [%x[Apanel], #0x70]\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n" + ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n" "add %x[Apanel], %x[Apanel], #0x80\n" - "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" "bge 4b\n" "5:" // main loop skip "ldr q3, [%x[Apanel], #0x0]\n" @@ -215,88 +215,88 @@ void a64_ffinterleaved_bf16fp32_mmla_8x12( ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" "cbz x20, 6f\n" - "ldr q6, [x25, #0x0]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q7, [x25, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q4, [x22, #0x0]\n" - "ldr q5, [x22, #0x10]\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "ldr q1, [x23, #0x0]\n" + "ldr q7, [%x[Apanel], #0x0]\n" + ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n" + "ldr q6, [%x[Apanel], #0x10]\n" + "ldr q0, [x23, #0x10]\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" + "ldr q5, [%x[Apanel], #0x20]\n" + "ldr q4, [%x[Apanel], #0x30]\n" + ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n" + "ldr q3, [x22, #0x0]\n" + "ldr q2, [x22, #0x10]\n" + ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n" + ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - "ldr q6, [x21, #0x0]\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldr q7, [x21, #0x10]\n" - ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0c // bfmmla 
v12.4s, v0.8h, v5.8h\n" - ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" - ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" - ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n" + "ldr q1, [x21, #0x0]\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr q0, [x21, #0x10]\n" + ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n" + ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n" + ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n" + ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n" + ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n" + ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n" + ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n" + ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n" + ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n" + ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" "6:" // multiply loop done - "subs x24, x24, #0xc\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "subs x25, x25, #0xc\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 
v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], #0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp index 1495306879..b9b4ad54df 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp index 36bfccf52f..1e3f2f300b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp16_mla_8x24/generic.cpp @@ -51,27 +51,27 @@ void a64_ffinterleaved_fp16_mla_8x24( __asm__ __volatile__( "1:" // Height loop - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x24, [%x[args_ptr], %[offsetof_N]]\n" - "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x23, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x24, %x[Apanel]\n" "2:" // Width loop - "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" - "add x22, x25, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "cmp x24, #0x10\n" - "mov %x[Apanel], x23\n" + "cmp x25, #0x10\n" + "mov %x[Apanel], x24\n" "bgt 3f\n" - "cmp x24, #0x8\n" - "mov x21, x25\n" + "cmp x25, #0x8\n" + "mov x21, x23\n" "bgt 3f\n" - "mov x22, x25\n" + "mov x22, x23\n" "3:" // B setup done "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q2, [x25, #0x0]\n" + "ldr q2, [x23, #0x0]\n" "movi v8.16b, #0x0\n" "ldr q3, [x22, #0x0]\n" "ldr q4, [x21, #0x0]\n" @@ -102,11 +102,11 @@ void a64_ffinterleaved_fp16_mla_8x24( "movi v31.16b, #0x0\n" "blt 5f\n" "4:" // main loop head - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q5, [x25, #0x10]\n" + "ldr q7, [%x[Apanel], #0x10]\n" + "ldr q6, [x23, #0x10]\n" "fmla v8.8h, v2.8h, v0.h[0]\n" - "ldr q6, [x22, #0x10]\n" - "ldr q7, [x21, #0x10]\n" + "ldr q5, [x22, #0x10]\n" + "ldr q1, [x21, #0x10]\n" "fmla v11.8h, v2.8h, v0.h[1]\n" "fmla v14.8h, v2.8h, v0.h[2]\n" "fmla v17.8h, v2.8h, v0.h[3]\n" @@ -119,8 +119,8 @@ void a64_ffinterleaved_fp16_mla_8x24( "add %x[Apanel], %x[Apanel], #0x20\n" "fmla v9.8h, v3.8h, v0.h[0]\n" "fmla v12.8h, v3.8h, v0.h[1]\n" - "add x25, x25, #0x20\n" - "ldr q2, [x25, #0x0]\n" + "add x23, x23, #0x20\n" + "ldr q2, [x23, #0x0]\n" "fmla v15.8h, v3.8h, v0.h[2]\n" "fmla v18.8h, v3.8h, v0.h[3]\n" "fmla v21.8h, v3.8h, v0.h[4]\n" @@ -140,30 +140,30 @@ void a64_ffinterleaved_fp16_mla_8x24( "fmla v31.8h, v4.8h, v0.h[7]\n" "ldr q0, [%x[Apanel], #0x0]\n" "ldr q4, [x21, #0x0]\n" - "fmla v8.8h, v5.8h, v1.h[0]\n" - "fmla v11.8h, v5.8h, v1.h[1]\n" - "fmla v14.8h, v5.8h, v1.h[2]\n" - "fmla v17.8h, v5.8h, v1.h[3]\n" - "fmla v20.8h, v5.8h, v1.h[4]\n" - "fmla v23.8h, v5.8h, v1.h[5]\n" - "fmla v26.8h, v5.8h, v1.h[6]\n" - "fmla v29.8h, v5.8h, v1.h[7]\n" - "fmla v9.8h, v6.8h, v1.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v15.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v1.h[3]\n" - "fmla v21.8h, v6.8h, v1.h[4]\n" - "fmla v24.8h, v6.8h, v1.h[5]\n" - "fmla v27.8h, v6.8h, v1.h[6]\n" - "fmla v30.8h, v6.8h, v1.h[7]\n" - "fmla v10.8h, v7.8h, v1.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v16.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v1.h[3]\n" - "fmla v22.8h, v7.8h, v1.h[4]\n" - "fmla v25.8h, v7.8h, v1.h[5]\n" - "fmla v28.8h, v7.8h, v1.h[6]\n" - "fmla v31.8h, v7.8h, v1.h[7]\n" + "fmla v8.8h, v6.8h, v7.h[0]\n" + "fmla v11.8h, v6.8h, v7.h[1]\n" + "fmla v14.8h, v6.8h, v7.h[2]\n" + "fmla v17.8h, v6.8h, 
v7.h[3]\n" + "fmla v20.8h, v6.8h, v7.h[4]\n" + "fmla v23.8h, v6.8h, v7.h[5]\n" + "fmla v26.8h, v6.8h, v7.h[6]\n" + "fmla v29.8h, v6.8h, v7.h[7]\n" + "fmla v9.8h, v5.8h, v7.h[0]\n" + "fmla v12.8h, v5.8h, v7.h[1]\n" + "fmla v15.8h, v5.8h, v7.h[2]\n" + "fmla v18.8h, v5.8h, v7.h[3]\n" + "fmla v21.8h, v5.8h, v7.h[4]\n" + "fmla v24.8h, v5.8h, v7.h[5]\n" + "fmla v27.8h, v5.8h, v7.h[6]\n" + "fmla v30.8h, v5.8h, v7.h[7]\n" + "fmla v10.8h, v1.8h, v7.h[0]\n" + "fmla v13.8h, v1.8h, v7.h[1]\n" + "fmla v16.8h, v1.8h, v7.h[2]\n" + "fmla v19.8h, v1.8h, v7.h[3]\n" + "fmla v22.8h, v1.8h, v7.h[4]\n" + "fmla v25.8h, v1.8h, v7.h[5]\n" + "fmla v28.8h, v1.8h, v7.h[6]\n" + "fmla v31.8h, v1.8h, v7.h[7]\n" "bge 4b\n" "5:" // main loop skip "fmla v8.8h, v2.8h, v0.h[0]\n" @@ -171,7 +171,7 @@ void a64_ffinterleaved_fp16_mla_8x24( "add %x[Apanel], %x[Apanel], #0x10\n" "fmla v14.8h, v2.8h, v0.h[2]\n" "fmla v17.8h, v2.8h, v0.h[3]\n" - "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" "fmla v20.8h, v2.8h, v0.h[4]\n" "fmla v23.8h, v2.8h, v0.h[5]\n" "add x22, x22, #0x10\n" @@ -195,37 +195,37 @@ void a64_ffinterleaved_fp16_mla_8x24( "fmla v28.8h, v4.8h, v0.h[6]\n" "fmla v31.8h, v4.8h, v0.h[7]\n" "cbz x20, 6f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q5, [x25, #0x0]\n" - "fmla v8.8h, v5.8h, v0.h[0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q7, [x21, #0x0]\n" - "fmla v11.8h, v5.8h, v0.h[1]\n" - "fmla v14.8h, v5.8h, v0.h[2]\n" - "fmla v17.8h, v5.8h, v0.h[3]\n" + "ldr q3, [%x[Apanel], #0x0]\n" + "ldr q2, [x23, #0x0]\n" + "fmla v8.8h, v2.8h, v3.h[0]\n" + "ldr q1, [x22, #0x0]\n" + "ldr q0, [x21, #0x0]\n" + "fmla v11.8h, v2.8h, v3.h[1]\n" + "fmla v14.8h, v2.8h, v3.h[2]\n" + "fmla v17.8h, v2.8h, v3.h[3]\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v20.8h, v5.8h, v0.h[4]\n" - "fmla v23.8h, v5.8h, v0.h[5]\n" - "fmla v26.8h, v5.8h, v0.h[6]\n" - "fmla v29.8h, v5.8h, v0.h[7]\n" - "fmla v9.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v0.h[1]\n" - "fmla v15.8h, v6.8h, v0.h[2]\n" - "fmla v18.8h, v6.8h, v0.h[3]\n" - "fmla v21.8h, v6.8h, v0.h[4]\n" - "fmla v24.8h, v6.8h, v0.h[5]\n" - "fmla v27.8h, v6.8h, v0.h[6]\n" - "fmla v30.8h, v6.8h, v0.h[7]\n" - "fmla v10.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v0.h[1]\n" - "fmla v16.8h, v7.8h, v0.h[2]\n" - "fmla v19.8h, v7.8h, v0.h[3]\n" - "fmla v22.8h, v7.8h, v0.h[4]\n" - "fmla v25.8h, v7.8h, v0.h[5]\n" - "fmla v28.8h, v7.8h, v0.h[6]\n" - "fmla v31.8h, v7.8h, v0.h[7]\n" + "fmla v20.8h, v2.8h, v3.h[4]\n" + "fmla v23.8h, v2.8h, v3.h[5]\n" + "fmla v26.8h, v2.8h, v3.h[6]\n" + "fmla v29.8h, v2.8h, v3.h[7]\n" + "fmla v9.8h, v1.8h, v3.h[0]\n" + "fmla v12.8h, v1.8h, v3.h[1]\n" + "fmla v15.8h, v1.8h, v3.h[2]\n" + "fmla v18.8h, v1.8h, v3.h[3]\n" + "fmla v21.8h, v1.8h, v3.h[4]\n" + "fmla v24.8h, v1.8h, v3.h[5]\n" + "fmla v27.8h, v1.8h, v3.h[6]\n" + "fmla v30.8h, v1.8h, v3.h[7]\n" + "fmla v10.8h, v0.8h, v3.h[0]\n" + "fmla v13.8h, v0.8h, v3.h[1]\n" + "fmla v16.8h, v0.8h, v3.h[2]\n" + "fmla v19.8h, v0.8h, v3.h[3]\n" + "fmla v22.8h, v0.8h, v3.h[4]\n" + "fmla v25.8h, v0.8h, v3.h[5]\n" + "fmla v28.8h, v0.8h, v3.h[6]\n" + "fmla v31.8h, v0.8h, v3.h[7]\n" "6:" // multiply loop done - "subs x24, x24, #0x18\n" + "subs x25, x25, #0x18\n" "str q8, [%x[Cpanel], #0x0]\n" "str q9, [%x[Cpanel], #0x10]\n" "str q10, [%x[Cpanel], #0x20]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp index f2a836c9b4..c4445ba14a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp index ec99d64f4a..6de0a380eb 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_ffinterleaved_fp32_mla_8x12/generic.cpp @@ -51,29 +51,29 @@ void a64_ffinterleaved_fp32_mla_8x12( __asm__ __volatile__( "1:" // Height loop - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x24, [%x[args_ptr], %[offsetof_N]]\n" - "str x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x23, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x24, %x[Apanel]\n" "2:" // Width loop - "ldr x25, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x23, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" - "add x22, x25, x20, LSL #2\n" + "add x22, x23, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" "add x20, x21, x20, LSL #2\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "cmp x24, #0x8\n" - "mov %x[Apanel], x23\n" + "cmp x25, #0x8\n" + "mov %x[Apanel], x24\n" "bgt 3f\n" - "cmp x24, #0x4\n" - "mov x21, x25\n" + "cmp x25, #0x4\n" + "mov x21, x23\n" "bgt 3f\n" - "mov x22, x25\n" + "mov x22, x23\n" "3:" // B setup done "ldr q0, [%x[Apanel], #0x0]\n" "ldr q1, [%x[Apanel], #0x10]\n" "movi v8.16b, #0x0\n" - "ldr q4, [x25, #0x0]\n" + "ldr q4, [x23, #0x0]\n" "ldr q5, [x22, #0x0]\n" "movi v9.16b, #0x0\n" "ldr q6, [x21, #0x0]\n" @@ -103,10 +103,10 @@ void a64_ffinterleaved_fp32_mla_8x12( "movi v31.16b, #0x0\n" "blt 5f\n" "4:" // main loop head - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" + "ldr q3, [%x[Apanel], #0x20]\n" + "ldr q7, [%x[Apanel], #0x30]\n" "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q7, [x25, #0x10]\n" + "ldr q2, [x23, #0x10]\n" "fmla v11.4s, v4.4s, v0.s[1]\n" "fmla v14.4s, v4.4s, v0.s[2]\n" "fmla v17.4s, v4.4s, v0.s[3]\n" @@ -136,36 +136,36 @@ void a64_ffinterleaved_fp32_mla_8x12( "fmla v28.4s, v6.4s, v1.s[2]\n" "fmla v31.4s, v6.4s, v1.s[3]\n" "ldr q1, 
[%x[Apanel], #0x50]\n" - "ldr q6, [x25, #0x20]\n" - "fmla v8.4s, v7.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v2.s[1]\n" - "fmla v14.4s, v7.4s, v2.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v20.4s, v7.4s, v3.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "fmla v26.4s, v7.4s, v3.s[2]\n" - "fmla v29.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x22, #0x20]\n" - "fmla v9.4s, v4.4s, v2.s[0]\n" - "fmla v12.4s, v4.4s, v2.s[1]\n" - "fmla v15.4s, v4.4s, v2.s[2]\n" - "fmla v18.4s, v4.4s, v2.s[3]\n" - "fmla v21.4s, v4.4s, v3.s[0]\n" - "fmla v24.4s, v4.4s, v3.s[1]\n" - "fmla v27.4s, v4.4s, v3.s[2]\n" - "fmla v30.4s, v4.4s, v3.s[3]\n" + "ldr q6, [x23, #0x20]\n" + "fmla v8.4s, v2.4s, v3.s[0]\n" + "fmla v11.4s, v2.4s, v3.s[1]\n" + "fmla v14.4s, v2.4s, v3.s[2]\n" + "fmla v17.4s, v2.4s, v3.s[3]\n" + "fmla v20.4s, v2.4s, v7.s[0]\n" + "fmla v23.4s, v2.4s, v7.s[1]\n" + "fmla v26.4s, v2.4s, v7.s[2]\n" + "fmla v29.4s, v2.4s, v7.s[3]\n" + "ldr q2, [x22, #0x20]\n" + "fmla v9.4s, v4.4s, v3.s[0]\n" + "fmla v12.4s, v4.4s, v3.s[1]\n" + "fmla v15.4s, v4.4s, v3.s[2]\n" + "fmla v18.4s, v4.4s, v3.s[3]\n" + "fmla v21.4s, v4.4s, v7.s[0]\n" + "fmla v24.4s, v4.4s, v7.s[1]\n" + "fmla v27.4s, v4.4s, v7.s[2]\n" + "fmla v30.4s, v4.4s, v7.s[3]\n" "ldr q4, [x21, #0x20]\n" - "fmla v10.4s, v5.4s, v2.s[0]\n" - "fmla v13.4s, v5.4s, v2.s[1]\n" - "fmla v16.4s, v5.4s, v2.s[2]\n" - "fmla v19.4s, v5.4s, v2.s[3]\n" - "ldr q2, [%x[Apanel], #0x60]\n" - "fmla v22.4s, v5.4s, v3.s[0]\n" - "fmla v25.4s, v5.4s, v3.s[1]\n" - "fmla v28.4s, v5.4s, v3.s[2]\n" - "fmla v31.4s, v5.4s, v3.s[3]\n" - "ldr q3, [%x[Apanel], #0x70]\n" - "ldr q5, [x25, #0x30]\n" + "fmla v10.4s, v5.4s, v3.s[0]\n" + "fmla v13.4s, v5.4s, v3.s[1]\n" + "fmla v16.4s, v5.4s, v3.s[2]\n" + "fmla v19.4s, v5.4s, v3.s[3]\n" + "ldr q3, [%x[Apanel], #0x60]\n" + "fmla v22.4s, v5.4s, v7.s[0]\n" + "fmla v25.4s, v5.4s, v7.s[1]\n" + "fmla v28.4s, v5.4s, v7.s[2]\n" + "fmla v31.4s, v5.4s, v7.s[3]\n" + "ldr q7, [%x[Apanel], #0x70]\n" + "ldr q5, [x23, #0x30]\n" "fmla v8.4s, v6.4s, v0.s[0]\n" "fmla v11.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v0.s[2]\n" @@ -173,20 +173,20 @@ void a64_ffinterleaved_fp32_mla_8x12( "add %x[Apanel], %x[Apanel], #0x80\n" "fmla v20.4s, v6.4s, v1.s[0]\n" "fmla v23.4s, v6.4s, v1.s[1]\n" - "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" "fmla v26.4s, v6.4s, v1.s[2]\n" "fmla v29.4s, v6.4s, v1.s[3]\n" "ldr q6, [x22, #0x30]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v12.4s, v7.4s, v0.s[1]\n" + "fmla v9.4s, v2.4s, v0.s[0]\n" + "fmla v12.4s, v2.4s, v0.s[1]\n" "add x22, x22, #0x40\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" - "fmla v18.4s, v7.4s, v0.s[3]\n" - "fmla v21.4s, v7.4s, v1.s[0]\n" - "fmla v24.4s, v7.4s, v1.s[1]\n" - "fmla v27.4s, v7.4s, v1.s[2]\n" - "fmla v30.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x21, #0x30]\n" + "fmla v15.4s, v2.4s, v0.s[2]\n" + "fmla v18.4s, v2.4s, v0.s[3]\n" + "fmla v21.4s, v2.4s, v1.s[0]\n" + "fmla v24.4s, v2.4s, v1.s[1]\n" + "fmla v27.4s, v2.4s, v1.s[2]\n" + "fmla v30.4s, v2.4s, v1.s[3]\n" + "ldr q2, [x21, #0x30]\n" "fmla v10.4s, v4.4s, v0.s[0]\n" "fmla v13.4s, v4.4s, v0.s[1]\n" "add x21, x21, #0x40\n" @@ -198,33 +198,33 @@ void a64_ffinterleaved_fp32_mla_8x12( "fmla v28.4s, v4.4s, v1.s[2]\n" "fmla v31.4s, v4.4s, v1.s[3]\n" "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q4, [x25, #0x0]\n" - "fmla v8.4s, v5.4s, v2.s[0]\n" - "fmla v11.4s, v5.4s, v2.s[1]\n" - "fmla v14.4s, v5.4s, v2.s[2]\n" - "fmla v17.4s, v5.4s, v2.s[3]\n" - "fmla v20.4s, v5.4s, v3.s[0]\n" - "fmla v23.4s, v5.4s, v3.s[1]\n" - "fmla v26.4s, v5.4s, v3.s[2]\n" - "fmla v29.4s, v5.4s, v3.s[3]\n" + "ldr q4, [x23, 
#0x0]\n" + "fmla v8.4s, v5.4s, v3.s[0]\n" + "fmla v11.4s, v5.4s, v3.s[1]\n" + "fmla v14.4s, v5.4s, v3.s[2]\n" + "fmla v17.4s, v5.4s, v3.s[3]\n" + "fmla v20.4s, v5.4s, v7.s[0]\n" + "fmla v23.4s, v5.4s, v7.s[1]\n" + "fmla v26.4s, v5.4s, v7.s[2]\n" + "fmla v29.4s, v5.4s, v7.s[3]\n" "ldr q5, [x22, #0x0]\n" - "fmla v9.4s, v6.4s, v2.s[0]\n" - "fmla v12.4s, v6.4s, v2.s[1]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v21.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v3.s[1]\n" - "fmla v27.4s, v6.4s, v3.s[2]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" + "fmla v9.4s, v6.4s, v3.s[0]\n" + "fmla v12.4s, v6.4s, v3.s[1]\n" + "fmla v15.4s, v6.4s, v3.s[2]\n" + "fmla v18.4s, v6.4s, v3.s[3]\n" + "fmla v21.4s, v6.4s, v7.s[0]\n" + "fmla v24.4s, v6.4s, v7.s[1]\n" + "fmla v27.4s, v6.4s, v7.s[2]\n" + "fmla v30.4s, v6.4s, v7.s[3]\n" "ldr q6, [x21, #0x0]\n" - "fmla v10.4s, v7.4s, v2.s[0]\n" - "fmla v13.4s, v7.4s, v2.s[1]\n" - "fmla v16.4s, v7.4s, v2.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v22.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v3.s[1]\n" - "fmla v28.4s, v7.4s, v3.s[2]\n" - "fmla v31.4s, v7.4s, v3.s[3]\n" + "fmla v10.4s, v2.4s, v3.s[0]\n" + "fmla v13.4s, v2.4s, v3.s[1]\n" + "fmla v16.4s, v2.4s, v3.s[2]\n" + "fmla v19.4s, v2.4s, v3.s[3]\n" + "fmla v22.4s, v2.4s, v7.s[0]\n" + "fmla v25.4s, v2.4s, v7.s[1]\n" + "fmla v28.4s, v2.4s, v7.s[2]\n" + "fmla v31.4s, v2.4s, v7.s[3]\n" "bge 4b\n" "5:" // main loop skip "fmla v8.4s, v4.4s, v0.s[0]\n" @@ -232,7 +232,7 @@ void a64_ffinterleaved_fp32_mla_8x12( "add %x[Apanel], %x[Apanel], #0x20\n" "fmla v14.4s, v4.4s, v0.s[2]\n" "fmla v17.4s, v4.4s, v0.s[3]\n" - "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" "fmla v20.4s, v4.4s, v1.s[0]\n" "fmla v23.4s, v4.4s, v1.s[1]\n" "add x22, x22, #0x10\n" @@ -257,43 +257,43 @@ void a64_ffinterleaved_fp32_mla_8x12( "fmla v31.4s, v6.4s, v1.s[3]\n" "cbz x20, 7f\n" "6:" // odd loop - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [%x[Apanel], #0x0]\n" + "ldr q3, [%x[Apanel], #0x10]\n" "subs x20, x20, #0x1\n" - "ldr q7, [x25, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "fmla v8.4s, v7.4s, v0.s[0]\n" - "ldr q5, [x21, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v14.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v0.s[3]\n" - "fmla v20.4s, v7.4s, v1.s[0]\n" + "ldr q2, [x23, #0x0]\n" + "ldr q1, [x22, #0x0]\n" + "fmla v8.4s, v2.4s, v4.s[0]\n" + "ldr q0, [x21, #0x0]\n" + "fmla v11.4s, v2.4s, v4.s[1]\n" + "fmla v14.4s, v2.4s, v4.s[2]\n" + "fmla v17.4s, v2.4s, v4.s[3]\n" + "fmla v20.4s, v2.4s, v3.s[0]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v23.4s, v7.4s, v1.s[1]\n" - "fmla v26.4s, v7.4s, v1.s[2]\n" - "add x25, x25, #0x10\n" - "fmla v29.4s, v7.4s, v1.s[3]\n" - "fmla v9.4s, v4.4s, v0.s[0]\n" + "fmla v23.4s, v2.4s, v3.s[1]\n" + "fmla v26.4s, v2.4s, v3.s[2]\n" + "add x23, x23, #0x10\n" + "fmla v29.4s, v2.4s, v3.s[3]\n" + "fmla v9.4s, v1.4s, v4.s[0]\n" "add x22, x22, #0x10\n" - "fmla v12.4s, v4.4s, v0.s[1]\n" - "fmla v15.4s, v4.4s, v0.s[2]\n" + "fmla v12.4s, v1.4s, v4.s[1]\n" + "fmla v15.4s, v1.4s, v4.s[2]\n" "add x21, x21, #0x10\n" - "fmla v18.4s, v4.4s, v0.s[3]\n" - "fmla v21.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v1.s[1]\n" - "fmla v27.4s, v4.4s, v1.s[2]\n" - "fmla v30.4s, v4.4s, v1.s[3]\n" - "fmla v10.4s, v5.4s, v0.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[1]\n" - "fmla v16.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v0.s[3]\n" - "fmla v22.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v1.s[1]\n" - "fmla v28.4s, v5.4s, v1.s[2]\n" - "fmla v31.4s, v5.4s, v1.s[3]\n" + "fmla 
v18.4s, v1.4s, v4.s[3]\n" + "fmla v21.4s, v1.4s, v3.s[0]\n" + "fmla v24.4s, v1.4s, v3.s[1]\n" + "fmla v27.4s, v1.4s, v3.s[2]\n" + "fmla v30.4s, v1.4s, v3.s[3]\n" + "fmla v10.4s, v0.4s, v4.s[0]\n" + "fmla v13.4s, v0.4s, v4.s[1]\n" + "fmla v16.4s, v0.4s, v4.s[2]\n" + "fmla v19.4s, v0.4s, v4.s[3]\n" + "fmla v22.4s, v0.4s, v3.s[0]\n" + "fmla v25.4s, v0.4s, v3.s[1]\n" + "fmla v28.4s, v0.4s, v3.s[2]\n" + "fmla v31.4s, v0.4s, v3.s[3]\n" "bne 6b\n" "7:" // multiply loop done - "subs x24, x24, #0xc\n" + "subs x25, x25, #0xc\n" "str q8, [%x[Cpanel], #0x0]\n" "str q9, [%x[Cpanel], #0x10]\n" "str q10, [%x[Cpanel], #0x20]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp index 3b8770e153..f1427669ea 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -99,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp index 02d2434356..fc323ea4fc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp @@ -93,7 +93,6 @@ void a64_hybrid_bf16fp32_dot_6x16 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 176f\n" @@ -190,11 +189,11 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -211,37 +210,37 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "blt 19f\n" "18:" // Height 1: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4f60fa28 
// bfdot v8.4s, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0xf0]\n" "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x10\n" "add x10, x10, #0x100\n" @@ -251,37 +250,37 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "bge 18b\n" "19:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x8\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "20:" // Height 1: Multiply loop: Main loop skip @@ -289,31 +288,31 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "cmp x27, #0x2\n" "blt 22f\n" "21:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x10, #0x0]\n" + ".inst 0x4f52f208 // bfdot v8.4s, v16.8h, v18.h[0]\n" "sub x27, x27, #0x2\n" 
- "ldr q7, [x10, #0x10]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q16, [x10, #0x10]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f52f209 // bfdot v9.4s, v16.8h, v18.h[0]\n" "cmp x27, #0x2\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f52f22a // bfdot v10.4s, v17.8h, v18.h[0]\n" + ".inst 0x4f52f20b // bfdot v11.4s, v16.8h, v18.h[0]\n" "add x10, x10, #0x40\n" "bge 21b\n" "22:" // Height 1: Multiply loop: Skip odd blocks "cbz x27, 24f\n" "ldr h0, [x26, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n" + "ldr q17, [x10, #0x20]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" "add x10, x10, #0x40\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -323,17 +322,17 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" "25:" // Height 1: No activation "cmp x11, #0x10\n" "bge 34f\n" @@ -511,12 +510,12 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "50:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 51f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 52f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -524,7 +523,7 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "b 52f\n" "51:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "52:" // Height 2: input setup done "cmp x27, #0x8\n" "blt 55f\n" @@ -537,156 +536,156 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "53:" // Height 2: Multiply loop: Main loop head ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - 
"ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "sub x27, x27, #0x8\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x26, x26, #0x10\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n" + "ldr q17, [x10, #0x40]\n" "add x25, x25, #0x10\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" + ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n" + "ldr q16, [x10, #0x50]\n" "cmp x27, #0x10\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x60]\n" + ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n" + ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n" + ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n" + ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n" + ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n" + ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n" + ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n" + ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n" + ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 
0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n" + ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 53b\n" "54:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "add x26, x26, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x25, x25, #0x10\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n" + "ldr q17, [x10, #0x40]\n" "sub x27, x27, #0x8\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" + ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n" + "ldr q16, [x10, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x60]\n" + ".inst 0x4f60f228 // bfdot v8.4s, v17.8h, v0.h[1]\n" + ".inst 0x4f61f22c // bfdot v12.4s, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f60f209 // bfdot v9.4s, v16.8h, v0.h[1]\n" + ".inst 0x4f61f20d // bfdot v13.4s, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4f60f22a // bfdot v10.4s, v17.8h, v0.h[1]\n" + ".inst 0x4f61f22e // bfdot v14.4s, v17.8h, v1.h[1]\n" + "ldr q17, [x10, 
#0x80]\n" + ".inst 0x4f60f20b // bfdot v11.4s, v16.8h, v0.h[1]\n" + ".inst 0x4f61f20f // bfdot v15.4s, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f40fa28 // bfdot v8.4s, v17.8h, v0.h[2]\n" + ".inst 0x4f41fa2c // bfdot v12.4s, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f40fa09 // bfdot v9.4s, v16.8h, v0.h[2]\n" + ".inst 0x4f41fa0d // bfdot v13.4s, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f40fa2a // bfdot v10.4s, v17.8h, v0.h[2]\n" + ".inst 0x4f41fa2e // bfdot v14.4s, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f40fa0b // bfdot v11.4s, v16.8h, v0.h[2]\n" + ".inst 0x4f41fa0f // bfdot v15.4s, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4f60fa28 // bfdot v8.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f61fa2c // bfdot v12.4s, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4f60fa09 // bfdot v9.4s, v16.8h, v0.h[3]\n" + ".inst 0x4f61fa0d // bfdot v13.4s, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f60fa2a // bfdot v10.4s, v17.8h, v0.h[3]\n" + ".inst 0x4f61fa2e // bfdot v14.4s, v17.8h, v1.h[3]\n" + ".inst 0x4f60fa0b // bfdot v11.4s, v16.8h, v0.h[3]\n" + ".inst 0x4f61fa0f // bfdot v15.4s, v16.8h, v1.h[3]\n" "55:" // Height 2: Multiply loop: Main loop skip "cbz x27, 59f\n" "cmp x27, #0x2\n" "blt 57f\n" "56:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x2\n" "cmp x27, #0x2\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x4f53f228 // bfdot v8.4s, v17.8h, v19.h[0]\n" + ".inst 0x4f52f22c // bfdot v12.4s, v17.8h, v18.h[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f53f209 // bfdot v9.4s, v16.8h, v19.h[0]\n" + ".inst 0x4f52f20d // bfdot v13.4s, v16.8h, v18.h[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f53f22a // bfdot v10.4s, v17.8h, v19.h[0]\n" + ".inst 0x4f52f22e // bfdot v14.4s, v17.8h, v18.h[0]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f53f20b // bfdot v11.4s, v16.8h, v19.h[0]\n" + ".inst 0x4f52f20f // bfdot v15.4s, v16.8h, v18.h[0]\n" "bge 56b\n" "57:" // Height 2: Multiply loop: Skip odd blocks "cbz x27, 59f\n" "ldr h0, [x26, #0x0]\n" "ldr h1, [x25, #0x0]\n" "58:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, 
#0x10]\n" + ".inst 0x4f40f228 // bfdot v8.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f41f22c // bfdot v12.4s, v17.8h, v1.h[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f40f209 // bfdot v9.4s, v16.8h, v0.h[0]\n" + ".inst 0x4f41f20d // bfdot v13.4s, v16.8h, v1.h[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f40f22a // bfdot v10.4s, v17.8h, v0.h[0]\n" + ".inst 0x4f41f22e // bfdot v14.4s, v17.8h, v1.h[0]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f40f20b // bfdot v11.4s, v16.8h, v0.h[0]\n" + ".inst 0x4f41f20f // bfdot v15.4s, v16.8h, v1.h[0]\n" "59:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -698,25 +697,25 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "prfm pstl1keep, [x25, #0x0]\n" "tbz %x[flags], #1, 60f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmin v12.4s, v12.4s, v17.4s\n" + "fmin v13.4s, v13.4s, v17.4s\n" + "fmin v14.4s, v14.4s, v17.4s\n" + "fmin v15.4s, v15.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" + "fmax v14.4s, v14.4s, v16.4s\n" + "fmax v15.4s, v15.4s, v16.4s\n" "60:" // Height 2: No activation "cmp x11, #0x10\n" "bge 69f\n" @@ -943,13 +942,13 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "85:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 86f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 87f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -958,8 +957,8 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "b 87f\n" "86:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "87:" // Height 3: input setup done "cmp x27, #0x8\n" "blt 90f\n" @@ -976,75 +975,75 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" "add x25, 
x25, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "add x24, x24, #0x10\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n" + ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n" "cmp x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n" + ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n" + "ldr q20, [x10, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n" + ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n" + ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n" + ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n" + ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n" + ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n" + ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, 
v2.h[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n" + ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n" + ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n" + ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n" + ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n" + ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n" + ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n" + ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n" + ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n" + ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n" + ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, v0.h[3]\n" + ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n" + ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n" + ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n" + ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n" + ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n" + ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 88b\n" @@ -1054,98 +1053,98 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" "add x24, x24, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "sub x27, x27, #0x8\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n" + ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - 
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n" + ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n" + "ldr q20, [x10, #0x50]\n" + ".inst 0x4f60f2a8 // bfdot v8.4s, v21.8h, v0.h[1]\n" + ".inst 0x4f61f2ac // bfdot v12.4s, v21.8h, v1.h[1]\n" + ".inst 0x4f62f2b0 // bfdot v16.4s, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 0x4f60f289 // bfdot v9.4s, v20.8h, v0.h[1]\n" + ".inst 0x4f61f28d // bfdot v13.4s, v20.8h, v1.h[1]\n" + ".inst 0x4f62f291 // bfdot v17.4s, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x4f60f2aa // bfdot v10.4s, v21.8h, v0.h[1]\n" + ".inst 0x4f61f2ae // bfdot v14.4s, v21.8h, v1.h[1]\n" + ".inst 0x4f62f2b2 // bfdot v18.4s, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x4f60f28b // bfdot v11.4s, v20.8h, v0.h[1]\n" + ".inst 0x4f61f28f // bfdot v15.4s, v20.8h, v1.h[1]\n" + ".inst 0x4f62f293 // bfdot v19.4s, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x4f40faa8 // bfdot v8.4s, v21.8h, v0.h[2]\n" + ".inst 0x4f41faac // bfdot v12.4s, v21.8h, v1.h[2]\n" + ".inst 0x4f42fab0 // bfdot v16.4s, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x4f40fa89 // bfdot v9.4s, v20.8h, v0.h[2]\n" + ".inst 0x4f41fa8d // bfdot v13.4s, v20.8h, v1.h[2]\n" + ".inst 0x4f42fa91 // bfdot v17.4s, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x4f40faaa // bfdot v10.4s, v21.8h, v0.h[2]\n" + ".inst 0x4f41faae // bfdot v14.4s, v21.8h, v1.h[2]\n" + ".inst 0x4f42fab2 // bfdot v18.4s, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x4f40fa8b // bfdot v11.4s, v20.8h, v0.h[2]\n" + ".inst 0x4f41fa8f // bfdot v15.4s, v20.8h, v1.h[2]\n" + ".inst 0x4f42fa93 // bfdot v19.4s, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x4f60faa8 // bfdot v8.4s, v21.8h, 
v0.h[3]\n" + ".inst 0x4f61faac // bfdot v12.4s, v21.8h, v1.h[3]\n" + ".inst 0x4f62fab0 // bfdot v16.4s, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x4f60fa89 // bfdot v9.4s, v20.8h, v0.h[3]\n" + ".inst 0x4f61fa8d // bfdot v13.4s, v20.8h, v1.h[3]\n" + ".inst 0x4f62fa91 // bfdot v17.4s, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f60faaa // bfdot v10.4s, v21.8h, v0.h[3]\n" + ".inst 0x4f61faae // bfdot v14.4s, v21.8h, v1.h[3]\n" + ".inst 0x4f62fab2 // bfdot v18.4s, v21.8h, v2.h[3]\n" + ".inst 0x4f60fa8b // bfdot v11.4s, v20.8h, v0.h[3]\n" + ".inst 0x4f61fa8f // bfdot v15.4s, v20.8h, v1.h[3]\n" + ".inst 0x4f62fa93 // bfdot v19.4s, v20.8h, v2.h[3]\n" "90:" // Height 3: Multiply loop: Main loop skip "cbz x27, 94f\n" "cmp x27, #0x2\n" "blt 92f\n" "91:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x2\n" "cmp x27, #0x2\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x10, #0x0]\n" + ".inst 0x4f58f2a8 // bfdot v8.4s, v21.8h, v24.h[0]\n" + ".inst 0x4f57f2ac // bfdot v12.4s, v21.8h, v23.h[0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x4f56f2b0 // bfdot v16.4s, v21.8h, v22.h[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x4f58f289 // bfdot v9.4s, v20.8h, v24.h[0]\n" + ".inst 0x4f57f28d // bfdot v13.4s, v20.8h, v23.h[0]\n" + ".inst 0x4f56f291 // bfdot v17.4s, v20.8h, v22.h[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f58f2aa // bfdot v10.4s, v21.8h, v24.h[0]\n" + ".inst 0x4f57f2ae // bfdot v14.4s, v21.8h, v23.h[0]\n" + ".inst 0x4f56f2b2 // bfdot v18.4s, v21.8h, v22.h[0]\n" + ".inst 0x4f58f28b // bfdot v11.4s, v20.8h, v24.h[0]\n" + ".inst 0x4f57f28f // bfdot v15.4s, v20.8h, v23.h[0]\n" + ".inst 0x4f56f293 // bfdot v19.4s, v20.8h, v22.h[0]\n" "bge 91b\n" "92:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 94f\n" @@ -1153,23 +1152,23 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ldr h1, [x25, #0x0]\n" "ldr h2, [x24, #0x0]\n" "93:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, 
v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q21, [x10, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x4f40f2a8 // bfdot v8.4s, v21.8h, v0.h[0]\n" + ".inst 0x4f41f2ac // bfdot v12.4s, v21.8h, v1.h[0]\n" + ".inst 0x4f42f2b0 // bfdot v16.4s, v21.8h, v2.h[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x4f40f289 // bfdot v9.4s, v20.8h, v0.h[0]\n" + ".inst 0x4f41f28d // bfdot v13.4s, v20.8h, v1.h[0]\n" + ".inst 0x4f42f291 // bfdot v17.4s, v20.8h, v2.h[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f40f2aa // bfdot v10.4s, v21.8h, v0.h[0]\n" + ".inst 0x4f41f2ae // bfdot v14.4s, v21.8h, v1.h[0]\n" + ".inst 0x4f42f2b2 // bfdot v18.4s, v21.8h, v2.h[0]\n" + ".inst 0x4f40f28b // bfdot v11.4s, v20.8h, v0.h[0]\n" + ".inst 0x4f41f28f // bfdot v15.4s, v20.8h, v1.h[0]\n" + ".inst 0x4f42f293 // bfdot v19.4s, v20.8h, v2.h[0]\n" "94:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1183,33 +1182,33 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "prfm pstl1keep, [x24, #0x0]\n" "tbz %x[flags], #1, 95f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v21.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v21.4s\n" + "fmin v9.4s, v9.4s, v21.4s\n" + "fmin v10.4s, v10.4s, v21.4s\n" + "fmin v11.4s, v11.4s, v21.4s\n" + "fmin v12.4s, v12.4s, v21.4s\n" + "fmin v13.4s, v13.4s, v21.4s\n" + "fmin v14.4s, v14.4s, v21.4s\n" + "fmin v15.4s, v15.4s, v21.4s\n" + "fmin v16.4s, v16.4s, v21.4s\n" + "fmin v17.4s, v17.4s, v21.4s\n" + "fmin v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "95:" // Height 3: No activation "cmp x11, #0x10\n" "bge 104f\n" @@ -1485,14 +1484,14 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "120:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 121f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 122f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1502,9 +1501,9 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "b 122f\n" "121:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "122:" // Height 4: input setup done "cmp x27, #0x8\n" "blt 125f\n" @@ -1523,7 +1522,7 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x26, x26, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x25, x25, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" @@ -1531,85 +1530,85 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x23, x23, #0x10\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "cmp x27, #0x10\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n" + ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n" + ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n" + ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f7 // bfdot 
v23.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n" + ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n" + ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n" + ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n" + ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n" + ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n" + ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n" + ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n" + ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n" + ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n" + ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n" + ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n" + ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n" + ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n" + ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n" + ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n" + ".inst 0x4f43fb34 // bfdot v20.4s, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n" + ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n" + ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n" + ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n" + ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n" + ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n" + ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n" + ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n" + ".inst 
0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n" + ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xd0]\n" + ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n" + ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n" + ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n" + ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n" + ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n" + ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n" + ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n" + ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n" + ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n" + ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 123b\n" @@ -1620,7 +1619,7 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x25, x25, #0x10\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x24, x24, #0x10\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" @@ -1628,112 +1627,112 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "sub x27, x27, #0x8\n" ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n" + ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n" + ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" 
- ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n" + ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n" + ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n" + ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x4f60f328 // bfdot v8.4s, v25.8h, v0.h[1]\n" + ".inst 0x4f61f32c // bfdot v12.4s, v25.8h, v1.h[1]\n" + ".inst 0x4f62f330 // bfdot v16.4s, v25.8h, v2.h[1]\n" + ".inst 0x4f63f334 // bfdot v20.4s, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x4f60f309 // bfdot v9.4s, v24.8h, v0.h[1]\n" + ".inst 0x4f61f30d // bfdot v13.4s, v24.8h, v1.h[1]\n" + ".inst 0x4f62f311 // bfdot v17.4s, v24.8h, v2.h[1]\n" + ".inst 0x4f63f315 // bfdot v21.4s, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x4f60f32a // bfdot v10.4s, v25.8h, v0.h[1]\n" + ".inst 0x4f61f32e // bfdot v14.4s, v25.8h, v1.h[1]\n" + ".inst 0x4f62f332 // bfdot v18.4s, v25.8h, v2.h[1]\n" + ".inst 0x4f63f336 // bfdot v22.4s, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x4f60f30b // bfdot v11.4s, v24.8h, v0.h[1]\n" + ".inst 0x4f61f30f // bfdot v15.4s, v24.8h, v1.h[1]\n" + ".inst 0x4f62f313 // bfdot v19.4s, v24.8h, v2.h[1]\n" + ".inst 0x4f63f317 // bfdot v23.4s, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x4f40fb28 // bfdot v8.4s, v25.8h, v0.h[2]\n" + ".inst 0x4f41fb2c // bfdot v12.4s, v25.8h, v1.h[2]\n" + ".inst 0x4f42fb30 // bfdot v16.4s, v25.8h, v2.h[2]\n" + ".inst 0x4f43fb34 
// bfdot v20.4s, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x4f40fb09 // bfdot v9.4s, v24.8h, v0.h[2]\n" + ".inst 0x4f41fb0d // bfdot v13.4s, v24.8h, v1.h[2]\n" + ".inst 0x4f42fb11 // bfdot v17.4s, v24.8h, v2.h[2]\n" + ".inst 0x4f43fb15 // bfdot v21.4s, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x4f40fb2a // bfdot v10.4s, v25.8h, v0.h[2]\n" + ".inst 0x4f41fb2e // bfdot v14.4s, v25.8h, v1.h[2]\n" + ".inst 0x4f42fb32 // bfdot v18.4s, v25.8h, v2.h[2]\n" + ".inst 0x4f43fb36 // bfdot v22.4s, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x4f40fb0b // bfdot v11.4s, v24.8h, v0.h[2]\n" + ".inst 0x4f41fb0f // bfdot v15.4s, v24.8h, v1.h[2]\n" + ".inst 0x4f42fb13 // bfdot v19.4s, v24.8h, v2.h[2]\n" + ".inst 0x4f43fb17 // bfdot v23.4s, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xd0]\n" + ".inst 0x4f60fb28 // bfdot v8.4s, v25.8h, v0.h[3]\n" + ".inst 0x4f61fb2c // bfdot v12.4s, v25.8h, v1.h[3]\n" + ".inst 0x4f62fb30 // bfdot v16.4s, v25.8h, v2.h[3]\n" + ".inst 0x4f63fb34 // bfdot v20.4s, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x4f60fb09 // bfdot v9.4s, v24.8h, v0.h[3]\n" + ".inst 0x4f61fb0d // bfdot v13.4s, v24.8h, v1.h[3]\n" + ".inst 0x4f62fb11 // bfdot v17.4s, v24.8h, v2.h[3]\n" + ".inst 0x4f63fb15 // bfdot v21.4s, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f60fb2a // bfdot v10.4s, v25.8h, v0.h[3]\n" + ".inst 0x4f61fb2e // bfdot v14.4s, v25.8h, v1.h[3]\n" + ".inst 0x4f62fb32 // bfdot v18.4s, v25.8h, v2.h[3]\n" + ".inst 0x4f63fb36 // bfdot v22.4s, v25.8h, v3.h[3]\n" + ".inst 0x4f60fb0b // bfdot v11.4s, v24.8h, v0.h[3]\n" + ".inst 0x4f61fb0f // bfdot v15.4s, v24.8h, v1.h[3]\n" + ".inst 0x4f62fb13 // bfdot v19.4s, v24.8h, v2.h[3]\n" + ".inst 0x4f63fb17 // bfdot v23.4s, v24.8h, v3.h[3]\n" "125:" // Height 4: Multiply loop: Main loop skip "cbz x27, 129f\n" "cmp x27, #0x2\n" "blt 127f\n" "126:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x2\n" "cmp x27, #0x2\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x4f5df328 // bfdot v8.4s, v25.8h, v29.h[0]\n" + ".inst 0x4f5cf32c // bfdot v12.4s, v25.8h, v28.h[0]\n" + ".inst 0x4f5bf330 // bfdot v16.4s, v25.8h, v27.h[0]\n" + ".inst 0x4f5af334 // bfdot v20.4s, v25.8h, v26.h[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x4f5df309 // bfdot v9.4s, v24.8h, v29.h[0]\n" + 
".inst 0x4f5cf30d // bfdot v13.4s, v24.8h, v28.h[0]\n" + ".inst 0x4f5bf311 // bfdot v17.4s, v24.8h, v27.h[0]\n" + ".inst 0x4f5af315 // bfdot v21.4s, v24.8h, v26.h[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f5df32a // bfdot v10.4s, v25.8h, v29.h[0]\n" + ".inst 0x4f5cf32e // bfdot v14.4s, v25.8h, v28.h[0]\n" + ".inst 0x4f5bf332 // bfdot v18.4s, v25.8h, v27.h[0]\n" + ".inst 0x4f5af336 // bfdot v22.4s, v25.8h, v26.h[0]\n" + ".inst 0x4f5df30b // bfdot v11.4s, v24.8h, v29.h[0]\n" + ".inst 0x4f5cf30f // bfdot v15.4s, v24.8h, v28.h[0]\n" + ".inst 0x4f5bf313 // bfdot v19.4s, v24.8h, v27.h[0]\n" + ".inst 0x4f5af317 // bfdot v23.4s, v24.8h, v26.h[0]\n" "bge 126b\n" "127:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 129f\n" @@ -1742,27 +1741,27 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ldr h2, [x24, #0x0]\n" "ldr h3, [x23, #0x0]\n" "128:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x4f40f328 // bfdot v8.4s, v25.8h, v0.h[0]\n" + ".inst 0x4f41f32c // bfdot v12.4s, v25.8h, v1.h[0]\n" + ".inst 0x4f42f330 // bfdot v16.4s, v25.8h, v2.h[0]\n" + ".inst 0x4f43f334 // bfdot v20.4s, v25.8h, v3.h[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x4f40f309 // bfdot v9.4s, v24.8h, v0.h[0]\n" + ".inst 0x4f41f30d // bfdot v13.4s, v24.8h, v1.h[0]\n" + ".inst 0x4f42f311 // bfdot v17.4s, v24.8h, v2.h[0]\n" + ".inst 0x4f43f315 // bfdot v21.4s, v24.8h, v3.h[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f40f32a // bfdot v10.4s, v25.8h, v0.h[0]\n" + ".inst 0x4f41f32e // bfdot v14.4s, v25.8h, v1.h[0]\n" + ".inst 0x4f42f332 // bfdot v18.4s, v25.8h, v2.h[0]\n" + ".inst 0x4f43f336 // bfdot v22.4s, v25.8h, v3.h[0]\n" + ".inst 0x4f40f30b // bfdot v11.4s, v24.8h, v0.h[0]\n" + ".inst 0x4f41f30f // bfdot v15.4s, v24.8h, v1.h[0]\n" + ".inst 0x4f42f313 // bfdot v19.4s, v24.8h, v2.h[0]\n" + ".inst 0x4f43f317 // bfdot v23.4s, v24.8h, v3.h[0]\n" "129:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1778,41 +1777,41 @@ void 
a64_hybrid_bf16fp32_dot_6x16 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 130f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v25.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" + "ld1r { v24.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v25.4s\n" + "fmin v9.4s, v9.4s, v25.4s\n" + "fmin v10.4s, v10.4s, v25.4s\n" + "fmin v11.4s, v11.4s, v25.4s\n" + "fmin v12.4s, v12.4s, v25.4s\n" + "fmin v13.4s, v13.4s, v25.4s\n" + "fmin v14.4s, v14.4s, v25.4s\n" + "fmin v15.4s, v15.4s, v25.4s\n" + "fmin v16.4s, v16.4s, v25.4s\n" + "fmin v17.4s, v17.4s, v25.4s\n" + "fmin v18.4s, v18.4s, v25.4s\n" + "fmin v19.4s, v19.4s, v25.4s\n" + "fmin v20.4s, v20.4s, v25.4s\n" + "fmin v21.4s, v21.4s, v25.4s\n" + "fmin v22.4s, v22.4s, v25.4s\n" + "fmin v23.4s, v23.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v24.4s\n" + "fmax v9.4s, v9.4s, v24.4s\n" + "fmax v10.4s, v10.4s, v24.4s\n" + "fmax v11.4s, v11.4s, v24.4s\n" + "fmax v12.4s, v12.4s, v24.4s\n" + "fmax v13.4s, v13.4s, v24.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "fmax v23.4s, v23.4s, v24.4s\n" "130:" // Height 4: No activation "cmp x11, #0x10\n" "bge 139f\n" @@ -2137,15 +2136,15 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "155:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 156f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 157f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2156,10 +2155,10 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "b 157f\n" "156:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add 
x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "157:" // Height 5: input setup done "cmp x27, #0x8\n" "blt 160f\n" @@ -2182,109 +2181,109 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x10\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" - ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, 
v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x10\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n" + ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n" + ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n" + ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n" + ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n" + ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n" + ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n" + ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n" + ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n" + ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n" + ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n" + ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n" + ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n" + ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n" + ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n" + ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n" + ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n" + ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n" + ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n" + ".inst 
0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n" + ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n" + ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n" + ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n" + ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n" + ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n" + ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n" + ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n" + ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n" + ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n" + ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n" + ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n" + ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n" + ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n" + ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n" + ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n" + ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n" + ".inst 0x4f62fbb0 // bfdot v16.4s, v29.8h, v2.h[3]\n" + ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n" + ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n" + ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n" + ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n" + ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n" + ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" - ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n" + ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n" + ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n" + ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n" + ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n" "ldr q3, [x23, #0x0]\n" - ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 158b\n" @@ -2298,7 +2297,7 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q29, [x10, #0x20]\n" ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" @@ -2307,131 +2306,131 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x4f43f0f5 
// bfdot v21.4s, v7.8h, v3.h[0]\n" ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q28, [x10, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" - ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" - ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" - ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" - ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" - ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" - ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" - ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" - ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" - ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" - ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4f60f8c8 // bfdot v8.4s, 
v6.8h, v0.h[3]\n" - ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" - ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" - ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, v2.h[0]\n" + ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n" + ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x4f60f3a8 // bfdot v8.4s, v29.8h, v0.h[1]\n" + ".inst 0x4f61f3ac // bfdot v12.4s, v29.8h, v1.h[1]\n" + ".inst 0x4f62f3b0 // bfdot v16.4s, v29.8h, v2.h[1]\n" + ".inst 0x4f63f3b4 // bfdot v20.4s, v29.8h, v3.h[1]\n" + ".inst 0x4f64f3b8 // bfdot v24.4s, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x4f60f389 // bfdot v9.4s, v28.8h, v0.h[1]\n" + ".inst 0x4f61f38d // bfdot v13.4s, v28.8h, v1.h[1]\n" + ".inst 0x4f62f391 // bfdot v17.4s, v28.8h, v2.h[1]\n" + ".inst 0x4f63f395 // bfdot v21.4s, v28.8h, v3.h[1]\n" + ".inst 0x4f64f399 // bfdot v25.4s, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x4f60f3aa // bfdot v10.4s, v29.8h, v0.h[1]\n" + ".inst 0x4f61f3ae // bfdot v14.4s, v29.8h, v1.h[1]\n" + ".inst 0x4f62f3b2 // bfdot v18.4s, v29.8h, v2.h[1]\n" + ".inst 0x4f63f3b6 // bfdot v22.4s, v29.8h, v3.h[1]\n" + ".inst 0x4f64f3ba // bfdot v26.4s, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x4f60f38b // bfdot v11.4s, v28.8h, v0.h[1]\n" + ".inst 0x4f61f38f // bfdot v15.4s, v28.8h, v1.h[1]\n" + ".inst 0x4f62f393 // bfdot v19.4s, v28.8h, v2.h[1]\n" + ".inst 0x4f63f397 // bfdot v23.4s, v28.8h, v3.h[1]\n" + ".inst 0x4f64f39b // bfdot v27.4s, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x4f40fba8 // bfdot v8.4s, v29.8h, v0.h[2]\n" + ".inst 0x4f41fbac // bfdot v12.4s, v29.8h, v1.h[2]\n" + ".inst 0x4f42fbb0 // bfdot v16.4s, v29.8h, v2.h[2]\n" + ".inst 0x4f43fbb4 // bfdot v20.4s, v29.8h, v3.h[2]\n" + ".inst 0x4f44fbb8 // bfdot v24.4s, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x4f40fb89 // bfdot v9.4s, v28.8h, v0.h[2]\n" + ".inst 0x4f41fb8d // bfdot v13.4s, v28.8h, v1.h[2]\n" + ".inst 0x4f42fb91 // bfdot v17.4s, v28.8h, v2.h[2]\n" + ".inst 0x4f43fb95 // bfdot v21.4s, v28.8h, v3.h[2]\n" + ".inst 0x4f44fb99 // bfdot v25.4s, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x4f40fbaa // bfdot v10.4s, v29.8h, v0.h[2]\n" + ".inst 0x4f41fbae // bfdot v14.4s, v29.8h, v1.h[2]\n" + ".inst 0x4f42fbb2 // bfdot v18.4s, v29.8h, v2.h[2]\n" + ".inst 0x4f43fbb6 // bfdot v22.4s, v29.8h, v3.h[2]\n" + ".inst 0x4f44fbba // bfdot v26.4s, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x4f40fb8b // bfdot v11.4s, v28.8h, v0.h[2]\n" + ".inst 0x4f41fb8f // bfdot v15.4s, v28.8h, v1.h[2]\n" + ".inst 0x4f42fb93 // bfdot v19.4s, v28.8h, v2.h[2]\n" + ".inst 0x4f43fb97 // bfdot v23.4s, v28.8h, v3.h[2]\n" + ".inst 0x4f44fb9b // bfdot v27.4s, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x4f60fba8 // bfdot v8.4s, v29.8h, v0.h[3]\n" + ".inst 0x4f61fbac // bfdot v12.4s, v29.8h, v1.h[3]\n" + ".inst 0x4f62fbb0 // 
bfdot v16.4s, v29.8h, v2.h[3]\n" + ".inst 0x4f63fbb4 // bfdot v20.4s, v29.8h, v3.h[3]\n" + ".inst 0x4f64fbb8 // bfdot v24.4s, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x4f60fb89 // bfdot v9.4s, v28.8h, v0.h[3]\n" + ".inst 0x4f61fb8d // bfdot v13.4s, v28.8h, v1.h[3]\n" + ".inst 0x4f62fb91 // bfdot v17.4s, v28.8h, v2.h[3]\n" + ".inst 0x4f63fb95 // bfdot v21.4s, v28.8h, v3.h[3]\n" + ".inst 0x4f64fb99 // bfdot v25.4s, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" - ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" - ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" - ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" - ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" - ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" - ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f60fbaa // bfdot v10.4s, v29.8h, v0.h[3]\n" + ".inst 0x4f61fbae // bfdot v14.4s, v29.8h, v1.h[3]\n" + ".inst 0x4f62fbb2 // bfdot v18.4s, v29.8h, v2.h[3]\n" + ".inst 0x4f63fbb6 // bfdot v22.4s, v29.8h, v3.h[3]\n" + ".inst 0x4f64fbba // bfdot v26.4s, v29.8h, v4.h[3]\n" + ".inst 0x4f60fb8b // bfdot v11.4s, v28.8h, v0.h[3]\n" + ".inst 0x4f61fb8f // bfdot v15.4s, v28.8h, v1.h[3]\n" + ".inst 0x4f62fb93 // bfdot v19.4s, v28.8h, v2.h[3]\n" + ".inst 0x4f63fb97 // bfdot v23.4s, v28.8h, v3.h[3]\n" + ".inst 0x4f64fb9b // bfdot v27.4s, v28.8h, v4.h[3]\n" "160:" // Height 5: Multiply loop: Main loop skip "cbz x27, 164f\n" "cmp x27, #0x2\n" "blt 162f\n" "161:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x2\n" "cmp x27, #0x2\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x10, #0x0]\n" + ".inst 0x4f42f3a8 // bfdot v8.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x4f40f3b0 // bfdot v16.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f5ff3b4 // bfdot v20.4s, v29.8h, v31.h[0]\n" + ".inst 0x4f5ef3b8 // bfdot v24.4s, v29.8h, v30.h[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f42f389 // bfdot v9.4s, v28.8h, v2.h[0]\n" + ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f40f391 // bfdot v17.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f5ff395 // bfdot v21.4s, v28.8h, v31.h[0]\n" + ".inst 0x4f5ef399 // bfdot v25.4s, v28.8h, v30.h[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, 
v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f42f3aa // bfdot v10.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n" + ".inst 0x4f40f3b2 // bfdot v18.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f5ff3b6 // bfdot v22.4s, v29.8h, v31.h[0]\n" + ".inst 0x4f5ef3ba // bfdot v26.4s, v29.8h, v30.h[0]\n" + ".inst 0x4f42f38b // bfdot v11.4s, v28.8h, v2.h[0]\n" + ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f40f393 // bfdot v19.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f5ff397 // bfdot v23.4s, v28.8h, v31.h[0]\n" + ".inst 0x4f5ef39b // bfdot v27.4s, v28.8h, v30.h[0]\n" "bge 161b\n" "162:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 164f\n" @@ -2441,31 +2440,31 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ldr h3, [x23, #0x0]\n" "ldr h4, [x22, #0x0]\n" "163:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q29, [x10, #0x0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x4f40f3a8 // bfdot v8.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f41f3ac // bfdot v12.4s, v29.8h, v1.h[0]\n" + ".inst 0x4f42f3b0 // bfdot v16.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f43f3b4 // bfdot v20.4s, v29.8h, v3.h[0]\n" + ".inst 0x4f44f3b8 // bfdot v24.4s, v29.8h, v4.h[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f40f389 // bfdot v9.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f41f38d // bfdot v13.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f42f391 // bfdot v17.4s, v28.8h, v2.h[0]\n" + ".inst 0x4f43f395 // bfdot v21.4s, v28.8h, v3.h[0]\n" + ".inst 0x4f44f399 // bfdot v25.4s, v28.8h, v4.h[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f40f3aa // bfdot v10.4s, v29.8h, v0.h[0]\n" + ".inst 0x4f41f3ae // bfdot v14.4s, v29.8h, v1.h[0]\n" + ".inst 0x4f42f3b2 // bfdot v18.4s, v29.8h, v2.h[0]\n" + ".inst 0x4f43f3b6 // bfdot v22.4s, v29.8h, v3.h[0]\n" + ".inst 0x4f44f3ba // bfdot v26.4s, v29.8h, v4.h[0]\n" + ".inst 0x4f40f38b // bfdot v11.4s, v28.8h, v0.h[0]\n" + ".inst 0x4f41f38f // bfdot v15.4s, v28.8h, v1.h[0]\n" + ".inst 0x4f42f393 // bfdot v19.4s, v28.8h, 
v2.h[0]\n" + ".inst 0x4f43f397 // bfdot v23.4s, v28.8h, v3.h[0]\n" + ".inst 0x4f44f39b // bfdot v27.4s, v28.8h, v4.h[0]\n" "164:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2483,49 +2482,49 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 165f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v29.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmin v26.4s, v26.4s, v1.4s\n" - "fmin v27.4s, v27.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" + "ld1r { v28.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v29.4s\n" + "fmin v9.4s, v9.4s, v29.4s\n" + "fmin v10.4s, v10.4s, v29.4s\n" + "fmin v11.4s, v11.4s, v29.4s\n" + "fmin v12.4s, v12.4s, v29.4s\n" + "fmin v13.4s, v13.4s, v29.4s\n" + "fmin v14.4s, v14.4s, v29.4s\n" + "fmin v15.4s, v15.4s, v29.4s\n" + "fmin v16.4s, v16.4s, v29.4s\n" + "fmin v17.4s, v17.4s, v29.4s\n" + "fmin v18.4s, v18.4s, v29.4s\n" + "fmin v19.4s, v19.4s, v29.4s\n" + "fmin v20.4s, v20.4s, v29.4s\n" + "fmin v21.4s, v21.4s, v29.4s\n" + "fmin v22.4s, v22.4s, v29.4s\n" + "fmin v23.4s, v23.4s, v29.4s\n" + "fmin v24.4s, v24.4s, v29.4s\n" + "fmin v25.4s, v25.4s, v29.4s\n" + "fmin v26.4s, v26.4s, v29.4s\n" + "fmin v27.4s, v27.4s, v29.4s\n" + "fmax v8.4s, v8.4s, v28.4s\n" + "fmax v9.4s, v9.4s, v28.4s\n" + "fmax v10.4s, v10.4s, v28.4s\n" + "fmax v11.4s, v11.4s, v28.4s\n" + "fmax v12.4s, v12.4s, v28.4s\n" + "fmax v13.4s, v13.4s, v28.4s\n" + "fmax v14.4s, v14.4s, v28.4s\n" + "fmax v15.4s, v15.4s, v28.4s\n" + "fmax v16.4s, v16.4s, v28.4s\n" + "fmax v17.4s, v17.4s, v28.4s\n" + "fmax v18.4s, v18.4s, v28.4s\n" + "fmax v19.4s, v19.4s, v28.4s\n" + "fmax v20.4s, v20.4s, v28.4s\n" + "fmax v21.4s, v21.4s, v28.4s\n" + "fmax v22.4s, v22.4s, v28.4s\n" + "fmax v23.4s, v23.4s, v28.4s\n" + "fmax v24.4s, v24.4s, v28.4s\n" + "fmax v25.4s, v25.4s, v28.4s\n" + "fmax v26.4s, v26.4s, v28.4s\n" + "fmax v27.4s, v27.4s, v28.4s\n" "165:" // Height 5: No activation "cmp x11, #0x10\n" "bge 174f\n" @@ -2902,16 +2901,16 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "190:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, 
[%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 191f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 192f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2923,11 +2922,11 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "b 192f\n" "191:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "192:" // Height 6: input setup done "cmp x27, #0x8\n" "blt 195f\n" @@ -3206,43 +3205,43 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "cmp x27, #0x2\n" "blt 197f\n" "196:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x2\n" "cmp x27, #0x2\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x4f47f028 // bfdot v8.4s, v1.8h, v7.h[0]\n" + ".inst 0x4f46f02c // bfdot v12.4s, v1.8h, v6.h[0]\n" + ".inst 0x4f45f030 // bfdot v16.4s, v1.8h, v5.h[0]\n" + ".inst 0x4f44f034 // bfdot v20.4s, v1.8h, v4.h[0]\n" + ".inst 0x4f43f038 // bfdot v24.4s, v1.8h, v3.h[0]\n" + ".inst 0x4f42f03c // bfdot v28.4s, v1.8h, v2.h[0]\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x4f47f009 // bfdot v9.4s, v0.8h, v7.h[0]\n" + ".inst 0x4f46f00d // bfdot v13.4s, v0.8h, v6.h[0]\n" + ".inst 0x4f45f011 // bfdot v17.4s, v0.8h, v5.h[0]\n" + ".inst 0x4f44f015 // bfdot v21.4s, v0.8h, v4.h[0]\n" + ".inst 0x4f43f019 // bfdot v25.4s, v0.8h, v3.h[0]\n" + ".inst 0x4f42f01d // bfdot v29.4s, v0.8h, v2.h[0]\n" + "ldr q0, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f45f0de // bfdot v30.4s, 
v6.8h, v5.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + ".inst 0x4f47f02a // bfdot v10.4s, v1.8h, v7.h[0]\n" + ".inst 0x4f46f02e // bfdot v14.4s, v1.8h, v6.h[0]\n" + ".inst 0x4f45f032 // bfdot v18.4s, v1.8h, v5.h[0]\n" + ".inst 0x4f44f036 // bfdot v22.4s, v1.8h, v4.h[0]\n" + ".inst 0x4f43f03a // bfdot v26.4s, v1.8h, v3.h[0]\n" + ".inst 0x4f42f03e // bfdot v30.4s, v1.8h, v2.h[0]\n" + ".inst 0x4f47f00b // bfdot v11.4s, v0.8h, v7.h[0]\n" + ".inst 0x4f46f00f // bfdot v15.4s, v0.8h, v6.h[0]\n" + ".inst 0x4f45f013 // bfdot v19.4s, v0.8h, v5.h[0]\n" + ".inst 0x4f44f017 // bfdot v23.4s, v0.8h, v4.h[0]\n" + ".inst 0x4f43f01b // bfdot v27.4s, v0.8h, v3.h[0]\n" + ".inst 0x4f42f01f // bfdot v31.4s, v0.8h, v2.h[0]\n" "bge 196b\n" "197:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 199f\n" @@ -3253,35 +3252,35 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "ldr h4, [x22, #0x0]\n" "ldr h5, [x21, #0x0]\n" "198:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" - ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ec // bfdot v12.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f0 // bfdot v16.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f4 // bfdot v20.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f8 // bfdot v24.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fc // bfdot v28.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4f40f0c9 // bfdot v9.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0cd // bfdot v13.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d1 // bfdot v17.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d5 // bfdot v21.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d9 // bfdot v25.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0dd // bfdot v29.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" - ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" - ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" - ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" - ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" - ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" - ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" - ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" - ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" - ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + ".inst 0x4f40f0ea // bfdot v10.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ee // 
bfdot v14.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f2 // bfdot v18.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f6 // bfdot v22.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fa // bfdot v26.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fe // bfdot v30.4s, v7.8h, v5.h[0]\n" + ".inst 0x4f40f0cb // bfdot v11.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0cf // bfdot v15.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d3 // bfdot v19.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d7 // bfdot v23.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0db // bfdot v27.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0df // bfdot v31.4s, v6.8h, v5.h[0]\n" "199:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3522,7 +3521,6 @@ void a64_hybrid_bf16fp32_dot_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "212:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp index 8cb743b777..d9e7259fa2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -99,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp index 5a000c69af..f6389e27d1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp @@ -93,7 +93,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 186f\n" @@ -211,11 +210,11 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "16:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -231,41 +230,41 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 20f\n" "19:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v20.2d, v1.2d, v21.2d\n" + ".inst 0x6e47ee88 // bfmmla v8.4s, v20.8h, v7.8h\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6e46ee8c // bfmmla v12.4s, v20.8h, v6.8h\n" + "ldr q19, [x10, #0x30]\n" + ".inst 0x6e51ee89 // bfmmla v9.4s, v20.8h, v17.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e53ee8d // bfmmla v13.4s, v20.8h, v19.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ee8a // bfmmla v10.4s, v20.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ee8e // bfmmla v14.4s, v20.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v21.2d\n" + ".inst 0x6e52ee8b // bfmmla v11.4s, v20.8h, v18.8h\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e51ee8f // bfmmla v15.4s, v20.8h, v17.8h\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 
0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xf0]\n" "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "ldr q1, [x26, #0x0]\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" @@ -273,40 +272,40 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "bge 19b\n" "20:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v19.2d, v1.2d, v20.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q18, [x10, #0x30]\n" + ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x50]\n" + ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x6e52ee6e // bfmmla v14.4s, v19.8h, v18.8h\n" + "ldr q24, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v20.2d\n" + ".inst 0x6e51ee6b // bfmmla v11.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6e58ee6f // bfmmla v15.4s, v19.8h, v24.8h\n" + "ldr q2, [x10, #0x90]\n" + ".inst 0x6e51ec28 // bfmmla v8.4s, v1.8h, v17.8h\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x8\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + 
".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "21:" // Height 1: Multiply loop: Main loop skip @@ -314,26 +313,26 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "cmp x27, #0x4\n" "blt 23f\n" "22:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr d19, [x26], #0x8\n" + "ldr q18, [x10, #0x0]\n" + "trn1 v19.2d, v19.2d, v17.2d\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x10, x10, #0x80\n" "bge 22b\n" "23:" // Height 1: Multiply loop: Skip odd blocks @@ -346,23 +345,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr h1, [x26, #0x0]\n" "25:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q20, [x10, #0x0]\n" + "ldr q18, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v17.2d\n" + ".inst 0x6e54ee68 // bfmmla v8.4s, v19.8h, v20.8h\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6e52ee6c // bfmmla v12.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x30]\n" + ".inst 0x6e51ee69 // bfmmla v9.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x6e52ee6d // bfmmla v13.4s, v19.8h, v18.8h\n" + "ldr q2, [x10, #0x50]\n" + ".inst 0x6e51ee6a // bfmmla v10.4s, v19.8h, v17.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e42ee6e // bfmmla v14.4s, v19.8h, v2.8h\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x10, x10, #0x80\n" "26:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" @@ -376,17 +375,17 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "uzp1 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "27:" // Height 1: No activation "cmp x11, #0x10\n" "bge 36f\n" @@ -577,12 +576,12 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "53:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 54f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 55f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -590,7 +589,7 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "b 55f\n" "54:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "55:" // Height 2: input setup done "cmp x27, #0x8\n" "blt 58f\n" @@ -601,85 +600,85 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 57f\n" "56:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, 
v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xf0]\n" "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x10\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x10, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "bge 56b\n" "57:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ee68 // bfmmla v8.4s, v19.8h, v7.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e46ee6c // bfmmla v12.4s, v19.8h, v6.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e52ec28 // bfmmla v8.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e51ec2c // bfmmla v12.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e52ec29 // bfmmla v9.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, 
#0xc0]\n" + ".inst 0x6e51ec2d // bfmmla v13.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e52ec2a // bfmmla v10.4s, v1.8h, v18.8h\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e51ec2e // bfmmla v14.4s, v1.8h, v17.8h\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e52ec2b // bfmmla v11.4s, v1.8h, v18.8h\n" + ".inst 0x6e51ec2f // bfmmla v15.4s, v1.8h, v17.8h\n" "sub x27, x27, #0x8\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" @@ -689,27 +688,27 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "cmp x27, #0x4\n" "blt 60f\n" "59:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d18, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "trn1 v19.2d, v18.2d, v17.2d\n" "sub x27, x27, #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q6, [x10, #0x40]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q6, [x10, #0x60]\n" - "ldr q7, [x10, #0x70]\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q26, [x10, #0x20]\n" + "ldr q5, [x10, #0x30]\n" + ".inst 0x6e5aee69 // bfmmla v9.4s, v19.8h, v26.8h\n" + ".inst 0x6e45ee6d // bfmmla v13.4s, v19.8h, v5.8h\n" + "ldr q18, [x10, #0x40]\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ee6a // bfmmla v10.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6e // bfmmla v14.4s, v19.8h, v17.8h\n" + "ldr q18, [x10, #0x60]\n" + "ldr q17, [x10, #0x70]\n" "cmp x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x10, x10, #0x80\n" "bge 59b\n" "60:" // Height 2: Multiply loop: Skip odd blocks @@ -725,23 +724,23 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr h1, [x26, #0x0]\n" "ldr h2, [x25, #0x0]\n" "62:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e52ee68 // bfmmla v8.4s, v19.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ee6c // bfmmla v12.4s, v19.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ee69 // bfmmla v9.4s, v19.8h, v18.8h\n" + "ldr q30, [x10, #0x40]\n" + ".inst 
0x6e51ee6d // bfmmla v13.4s, v19.8h, v17.8h\n" + "ldr q26, [x10, #0x50]\n" + ".inst 0x6e5eee6a // bfmmla v10.4s, v19.8h, v30.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e5aee6e // bfmmla v14.4s, v19.8h, v26.8h\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e52ee6b // bfmmla v11.4s, v19.8h, v18.8h\n" + ".inst 0x6e51ee6f // bfmmla v15.4s, v19.8h, v17.8h\n" "add x10, x10, #0x80\n" "63:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -762,25 +761,25 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "uzp2 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v18.4s\n" + "fmin v12.4s, v12.4s, v18.4s\n" + "fmin v13.4s, v13.4s, v18.4s\n" + "fmin v14.4s, v14.4s, v18.4s\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v7.4s, v7.4s, v17.4s\n" + "fmax v12.4s, v12.4s, v17.4s\n" + "fmax v13.4s, v13.4s, v17.4s\n" + "fmax v14.4s, v14.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "64:" // Height 2: No activation "cmp x11, #0x10\n" "bge 73f\n" @@ -1036,13 +1035,13 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "90:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 91f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 92f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1051,8 +1050,8 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "b 92f\n" "91:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "92:" // Height 3: input setup done "cmp x27, #0x8\n" "blt 95f\n" @@ -1064,167 +1063,167 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 94f\n" "93:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // 
bfmmla v9.4s, v0.8h, v7.8h\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "sub x27, x27, #0x8\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "cmp x27, #0x10\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, 
v25.8h\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 93b\n" "94:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "sub x27, x27, #0x8\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x6e5aec28 // 
bfmmla v8.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "95:" // Height 3: Multiply loop: Main loop skip "cbz x27, 100f\n" "cmp x27, #0x4\n" "blt 97f\n" "96:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr q26, [x10, #0x0]\n" + "trn1 v27.2d, v25.2d, v27.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "sub x27, x27, #0x4\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // 
bfmmla v13.4s, v28.8h, v25.8h\n" "cmp x27, #0x4\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "bge 96b\n" "97:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 100f\n" @@ -1242,33 +1241,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr h2, [x25, #0x0]\n" "ldr h3, [x24, #0x0]\n" "99:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q29, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v25.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e5def8c // bfmmla v12.4s, v28.8h, v29.8h\n" + ".inst 0x6e5def74 // bfmmla v20.4s, v27.8h, v29.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 
0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "100:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1294,33 +1293,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "uzp1 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 101f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "101:" // Height 3: No activation "cmp x11, #0x10\n" "bge 110f\n" @@ -1617,14 +1616,14 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "127:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 128f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 129f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1634,9 +1633,9 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "b 129f\n" "128:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "129:" // Height 4: input setup done "cmp x27, #0x8\n" 
"blt 132f\n" @@ -1645,177 +1644,177 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "cmp x27, #0x10\n" "ldr q3, [x24, #0x0]\n" "ldr q4, [x23, #0x0]\n" - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "blt 131f\n" - "130:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "sub x27, x27, #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "blt 131f\n" + "130:" // Height 4: Multiply loop: Main loop head + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" + "sub x27, x27, #0x8\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "add x23, x23, #0x10\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" "cmp x27, #0x10\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec74 // bfmmla 
v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 130b\n" "131:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ef88 // bfmmla v8.4s, v28.8h, v7.8h\n" "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ef70 // bfmmla v16.4s, v27.8h, v7.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e46ef8c // bfmmla v12.4s, v28.8h, v6.8h\n" + ".inst 0x6e46ef74 // bfmmla v20.4s, v27.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" "add x24, x24, #0x10\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // 
bfmmla v14.4s, v28.8h, v25.8h\n" "add x23, x23, #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" "sub x27, x27, #0x8\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x6e5aec28 // bfmmla v8.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e5aec70 // bfmmla v16.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e59ec2c // bfmmla v12.4s, v1.8h, v25.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e59ec74 // bfmmla v20.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e5aec29 // bfmmla v9.4s, v1.8h, v26.8h\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e5aec71 // bfmmla v17.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e59ec2d // bfmmla v13.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec75 // bfmmla v21.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e5aec2a // bfmmla v10.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec72 // bfmmla v18.4s, v3.8h, v26.8h\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e59ec2e // bfmmla v14.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec76 // bfmmla v22.4s, v3.8h, v25.8h\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e5aec2b // bfmmla v11.4s, v1.8h, v26.8h\n" + ".inst 0x6e5aec73 // bfmmla v19.4s, v3.8h, v26.8h\n" + ".inst 0x6e59ec2f // bfmmla v15.4s, v1.8h, v25.8h\n" + ".inst 0x6e59ec77 // bfmmla v23.4s, v3.8h, v25.8h\n" "132:" // Height 4: Multiply loop: Main loop skip "cbz x27, 137f\n" "cmp x27, #0x4\n" "blt 134f\n" "133:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "trn1 v27.2d, v26.2d, v25.2d\n" "cmp x27, #0x4\n" 
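Almost all of the churn in these hunks is a register renumbering (the v0/v2/v6/v7 working set becomes v25–v28), and because these kernels emit BFMMLA as hand-encoded `.inst` words rather than mnemonics, each rename also rewrites the hex encoding. The new words can be sanity-checked against the A64 field layout for BFMMLA Vd.4S, Vn.8H, Vm.8H (Rm at bits 20:16, Rn at bits 9:5, Rd at bits 4:0); a minimal checker, assuming that layout:

#include <cstdint>

// Rebuilds the .inst word for BFMMLA Vd.4S, Vn.8H, Vm.8H from the
// register numbers; 0x6E40EC00 is the opcode with Rd = Rn = Rm = 0.
constexpr uint32_t bfmmla_inst(uint32_t vd, uint32_t vn, uint32_t vm) {
    return 0x6E40EC00u | (vm << 16) | (vn << 5) | vd;
}

// Two encodings taken from the hunks above.
static_assert(bfmmla_inst(8, 28, 26) == 0x6E5AEF88, "bfmmla v8.4s, v28.8h, v26.8h");
static_assert(bfmmla_inst(23, 27, 25) == 0x6E59EF77, "bfmmla v23.4s, v27.8h, v25.8h");
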
- "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "bge 133b\n" "134:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 137f\n" @@ -1836,33 +1835,33 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr h3, [x24, #0x0]\n" "ldr h4, [x23, #0x0]\n" "136:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e5aef88 // bfmmla v8.4s, 
v28.8h, v26.8h\n" + ".inst 0x6e5aef70 // bfmmla v16.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ef8c // bfmmla v12.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef74 // bfmmla v20.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aef89 // bfmmla v9.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef71 // bfmmla v17.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ef8d // bfmmla v13.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef75 // bfmmla v21.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aef8a // bfmmla v10.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef72 // bfmmla v18.4s, v27.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ef8e // bfmmla v14.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef76 // bfmmla v22.4s, v27.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aef8b // bfmmla v11.4s, v28.8h, v26.8h\n" + ".inst 0x6e5aef73 // bfmmla v19.4s, v27.8h, v26.8h\n" + ".inst 0x6e59ef8f // bfmmla v15.4s, v28.8h, v25.8h\n" + ".inst 0x6e59ef77 // bfmmla v23.4s, v27.8h, v25.8h\n" "137:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1894,41 +1893,41 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "uzp2 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 138f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v21.4s, v21.4s, v26.4s\n" + "fmin v22.4s, v22.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, 
v25.4s\n" + "fmax v15.4s, v15.4s, v25.4s\n" + "fmax v20.4s, v20.4s, v25.4s\n" + "fmax v21.4s, v21.4s, v25.4s\n" + "fmax v22.4s, v22.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "138:" // Height 4: No activation "cmp x11, #0x10\n" "bge 147f\n" @@ -2290,15 +2289,15 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "164:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 165f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 166f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2309,10 +2308,10 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "b 166f\n" "165:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "166:" // Height 5: input setup done "cmp x27, #0x8\n" "blt 169f\n" @@ -2325,174 +2324,174 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr q7, [x10, #0x0]\n" "blt 168f\n" "167:" // Height 5: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" "sub x27, x27, #0x8\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n" "add x26, x26, #0x10\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n" "add x25, x25, #0x10\n" ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x40]\n" "add x24, x24, #0x10\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr 
q0, [x10, #0x50]\n" + ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n" "cmp x27, #0x10\n" ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xd0]\n" + ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - 
".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbb // bfmmla v27.4s, v5.8h, v6.8h\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n" "ldr q5, [x22, #0x0]\n" "bge 167b\n" "168:" // Height 5: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ecc8 // bfmmla v8.4s, v6.8h, v7.8h\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" "add x26, x26, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccc // bfmmla v12.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h\n" "add x25, x25, #0x10\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9c // bfmmla v28.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e47ecc9 // bfmmla v9.4s, v6.8h, v7.8h\n" "add x24, x24, #0x10\n" ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x40]\n" "add x23, x23, #0x10\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e40eccd // bfmmla v13.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec55 // bfmmla v21.4s, v2.8h, v0.8h\n" "add x22, x22, #0x10\n" "sub x27, x27, #0x8\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e47ecca // bfmmla v10.4s, v6.8h, v7.8h\n" "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ecce // bfmmla v14.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec56 // bfmmla v22.4s, v2.8h, v0.8h\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40ec9e // bfmmla v30.4s, v4.8h, v0.8h\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e47eccb // bfmmla v11.4s, v6.8h, v7.8h\n" "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla 
v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e40eccf // bfmmla v15.4s, v6.8h, v0.8h\n" + ".inst 0x6e40ec57 // bfmmla v23.4s, v2.8h, v0.8h\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xe0]\n" + ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n" ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" @@ -2502,48 +2501,48 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "blt 171f\n" "170:" // Height 5: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec50 
// bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr d0, [x22], #0x8\n" + "ldr q1, [x10, #0x0]\n" + "trn1 v2.2d, v0.2d, v2.2d\n" + ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" "cmp x27, #0x4\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n" "bge 170b\n" "171:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 174f\n" @@ -2567,42 +2566,42 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr h4, [x23, #0x0]\n" "ldr h5, [x22, #0x0]\n" "173:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - 
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v5.2d, v0.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x6e46ece8 // bfmmla v8.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec70 // bfmmla v16.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n" "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n" + ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n" "174:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3088,16 +3087,16 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "201:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 202f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, 
#0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 203f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -3109,11 +3108,11 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "b 203f\n" "202:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "203:" // Height 6: input setup done "cmp x27, #0x8\n" "blt 206f\n" @@ -3180,42 +3179,42 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "ldr q2, [x25, #0x0]\n" "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x6e40ec2c // bfmmla v12.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbc // bfmmla v28.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x6e46ec29 // bfmmla v9.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec71 // bfmmla v17.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecb9 // bfmmla v25.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x6e40ec2d // bfmmla v13.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbd // bfmmla v29.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xd0]\n" + ".inst 0x6e46ec2a // bfmmla v10.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec72 // bfmmla v18.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecba // bfmmla v26.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x6e40ec2e // bfmmla v14.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbe // bfmmla v30.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2b // bfmmla v11.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbb // 
bfmmla v27.4s, v5.8h, v6.8h\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e40ec2f // bfmmla v15.4s, v1.8h, v0.8h\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + ".inst 0x6e40ecbf // bfmmla v31.4s, v5.8h, v0.8h\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "bge 204b\n" @@ -3271,35 +3270,35 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x6e42ec2c // bfmmla v12.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec74 // bfmmla v20.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbc // bfmmla v28.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x6e40ec29 // bfmmla v9.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x6e42ec2d // bfmmla v13.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec75 // bfmmla v21.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbd // bfmmla v29.4s, v5.8h, v2.8h\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x6e40ec2a // bfmmla v10.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h\n" + "ldr q0, [x10, #0xe0]\n" + ".inst 0x6e42ec2e // bfmmla v14.4s, v1.8h, v2.8h\n" + ".inst 0x6e42ec76 // bfmmla v22.4s, v3.8h, v2.8h\n" + ".inst 0x6e42ecbe // bfmmla v30.4s, v5.8h, v2.8h\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e40ec2b // bfmmla v11.4s, v1.8h, v0.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ecbb // bfmmla v27.4s, v5.8h, v0.8h\n" ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" @@ -3309,49 +3308,49 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "blt 208f\n" "207:" // Height 6: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d0, [x25], 
#0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "cmp x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr d1, [x22], #0x8\n" + "ldr d0, [x21], #0x8\n" + "trn1 v2.2d, v1.2d, v0.2d\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e41ec88 // bfmmla v8.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec70 // bfmmla v16.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec58 // bfmmla v24.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec74 // bfmmla v20.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5c // bfmmla v28.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e41ec89 // bfmmla v9.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec71 // bfmmla v17.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec59 // bfmmla v25.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x6e40ec8d // bfmmla v13.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec75 // bfmmla v21.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5d // bfmmla v29.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e41ec8a // bfmmla v10.4s, v4.8h, v1.8h\n" + ".inst 0x6e41ec72 // bfmmla v18.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5a // bfmmla v26.4s, v2.8h, v1.8h\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e40ec8e // bfmmla v14.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5e // bfmmla v30.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec8b // bfmmla v11.4s, v4.8h, v6.8h\n" + ".inst 0x6e46ec73 // bfmmla v19.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + ".inst 0x6e40ec8f // bfmmla v15.4s, v4.8h, v0.8h\n" + ".inst 0x6e40ec77 // bfmmla v23.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5f // bfmmla v31.4s, v2.8h, v0.8h\n" "bge 207b\n" "208:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 211f\n" @@ -3378,42 +3377,42 @@ void 
a64_hybrid_bf16fp32_mmla_6x16 ( "ldr h5, [x22, #0x0]\n" "ldr h6, [x21, #0x0]\n" "210:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q0, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + ".inst 0x6e40ece8 // bfmmla v8.4s, v7.8h, v0.8h\n" + "trn1 v2.2d, v5.2d, v6.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x6e40ec70 // bfmmla v16.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec58 // bfmmla v24.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e41ecec // bfmmla v12.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec74 // bfmmla v20.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5c // bfmmla v28.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e40ece9 // bfmmla v9.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec71 // bfmmla v17.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec59 // bfmmla v25.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x6e41eced // bfmmla v13.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec75 // bfmmla v21.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e40ecea // bfmmla v10.4s, v7.8h, v0.8h\n" + ".inst 0x6e40ec72 // bfmmla v18.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5a // bfmmla v26.4s, v2.8h, v0.8h\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x6e41ecee // bfmmla v14.4s, v7.8h, v1.8h\n" + ".inst 0x6e41ec76 // bfmmla v22.4s, v3.8h, v1.8h\n" + ".inst 0x6e41ec5e // bfmmla v30.4s, v2.8h, v1.8h\n" "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" "add x10, x10, #0x80\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + ".inst 0x6e40ec73 // bfmmla v19.4s, v3.8h, v0.8h\n" + ".inst 0x6e40ec5b // bfmmla v27.4s, v2.8h, v0.8h\n" + ".inst 0x6e46ecef // bfmmla v15.4s, v7.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ec5f // bfmmla v31.4s, v2.8h, v6.8h\n" "211:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3678,7 +3677,6 @@ void a64_hybrid_bf16fp32_mmla_6x16 ( "madd %x[input_ptr], x20, 
x21, %x[input_ptr]\n" "b 1b\n" "224:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp index 8ce3d1b995..8b80c25beb 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -79,12 +79,12 @@ public: switch (ci->get_cpu_model()) { case CPUModel::A55r1: return { 6.94 }; + default: + return { 14.53 }; case CPUModel::A510: return { 8.94 }; case CPUModel::V1: return { 29.26 }; - default: - return { 14.53 }; } } @@ -108,5 +108,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp index 19636548a0..b049ed45f9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp @@ -244,11 +244,11 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "23:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 24f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" "cbnz x15, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #1\n" @@ -265,222 +265,222 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "blt 27f\n" "26:" // Height 1: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr d6, [x17, #0x20]\n" - "ldr x12, [x17, #0x28]\n" + "ldr d17, [x17, #0x20]\n" + "ldr x20, [x17, #0x28]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x38]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr d6, [x17, #0x40]\n" - "ldr x12, [x17, #0x48]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x58]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr d6, [x17, #0x60]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x78]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr d6, [x17, #0x80]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x98]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr d6, [x17, #0xa0]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xb8]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr d6, [x17, #0xc0]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xd8]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr d6, [x17, #0xe0]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xf8]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr d6, [x17, #0x100]\n" - "ldr x12, [x17, #0x108]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr d7, [x17, #0x110]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x118]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr d6, [x17, #0x120]\n" - "ldr x12, [x17, #0x128]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr d7, [x17, #0x130]\n" - "mov 
v6.d[1], x12\n" - "ldr x11, [x17, #0x138]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr d6, [x17, #0x140]\n" - "ldr x12, [x17, #0x148]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr d7, [x17, #0x150]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x158]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr d6, [x17, #0x160]\n" - "ldr x12, [x17, #0x168]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr d7, [x17, #0x170]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x178]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr d6, [x17, #0x180]\n" - "ldr x12, [x17, #0x188]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr d7, [x17, #0x190]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x198]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr d6, [x17, #0x1a0]\n" - "ldr x12, [x17, #0x1a8]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr d7, [x17, #0x1b0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x1b8]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr d6, [x17, #0x1c0]\n" - "ldr x12, [x17, #0x1c8]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr d7, [x17, #0x1d0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x1d8]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr d6, [x17, #0x1e0]\n" - "ldr x12, [x17, #0x1e8]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr d7, [x17, #0x1f0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x1f8]\n" - "mov v7.d[1], x11\n" + "ldr d16, [x17, #0x30]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x38]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr d17, [x17, #0x40]\n" + "ldr x20, [x17, #0x48]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr d16, [x17, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x58]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr d17, [x17, #0x60]\n" + "ldr x20, [x17, #0x68]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr d16, [x17, #0x70]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x78]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr d17, [x17, #0x80]\n" + "ldr x20, [x17, #0x88]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr d16, [x17, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x98]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr d17, [x17, #0xa0]\n" + "ldr x20, [x17, #0xa8]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr d16, [x17, #0xb0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xb8]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr d17, [x17, #0xc0]\n" + "ldr x20, [x17, #0xc8]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr d16, [x17, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xd8]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr d17, [x17, #0xe0]\n" + "ldr x20, [x17, #0xe8]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr d16, [x17, #0xf0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xf8]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr d17, [x17, #0x100]\n" + "ldr x20, [x17, #0x108]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr d16, [x17, #0x110]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x118]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr d17, [x17, #0x120]\n" + "ldr x20, [x17, #0x128]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr d16, [x17, #0x130]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x138]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr d17, [x17, #0x140]\n" + "ldr x20, [x17, #0x148]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr d16, [x17, #0x150]\n" + "mov v17.d[1], x20\n" 
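This A55-tuned path deliberately loads each 128-bit weight vector as a 64-bit `ldr d` plus a 64-bit `ldr x` whose value is moved into the high half with `mov v.d[1]`, a split that issues better on Cortex-A55's narrow load pipes than a single `ldr q`; the hunk only renames the scratch registers (x11/x12 to x20, v6/v7 to v16/v17) and keeps the pattern intact. Roughly the same idiom expressed with NEON intrinsics, as a sketch (assumes fp16 intrinsic support; names are illustrative):

#include <arm_neon.h>

// Build one 128-bit vector from two independent 64-bit loads so the
// halves can dual-issue on an in-order core such as Cortex-A55.
static inline float16x8_t load_q_as_halves(const float16_t *src) {
    float16x4_t lo = vld1_f16(src);      // "ldr d17, [x17, #0x20]"
    float16x4_t hi = vld1_f16(src + 4);  // "ldr x20, ..." / "mov v17.d[1], x20"
    return vcombine_f16(lo, hi);
}
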
+ "ldr x20, [x17, #0x158]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr d17, [x17, #0x160]\n" + "ldr x20, [x17, #0x168]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr d16, [x17, #0x170]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x178]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr d17, [x17, #0x180]\n" + "ldr x20, [x17, #0x188]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr d16, [x17, #0x190]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x198]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr d17, [x17, #0x1a0]\n" + "ldr x20, [x17, #0x1a8]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr d16, [x17, #0x1b0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x1b8]\n" + "mov v16.d[1], x20\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr d17, [x17, #0x1c0]\n" + "ldr x20, [x17, #0x1c8]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr d16, [x17, #0x1d0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x1d8]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr d17, [x17, #0x1e0]\n" + "ldr x20, [x17, #0x1e8]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr d16, [x17, #0x1f0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x1f8]\n" + "mov v16.d[1], x20\n" "add x13, x13, #0x10\n" "add x17, x17, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" "ldr d6, [x17, #0x0]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr x20, [x17, #0x8]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "ldr d0, [x13, #0x0]\n" "sub x14, x14, #0x8\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x10\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x18]\n" - "mov v0.d[1], x10\n" - "mov v7.d[1], x11\n" + "ldr x21, [x13, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x13, #0x80]\n" "bge 26b\n" "27:" // Height 1: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q17, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x17, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x17, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x17, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x17, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x17, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x17, #0x1b0]\n" - "fmla v10.8h, v6.8h, 
v0.h[6]\n" - "ldr q6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x17, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x17, #0x1f0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x17, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x17, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x17, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x17, #0x70]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x17, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x17, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x17, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x17, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x17, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x17, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x17, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x17, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x17, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x17, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x17, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x17, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x17, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x17, #0x150]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x17, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x17, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x17, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x17, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x17, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x17, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x17, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x17, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr q17, [x17, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr q16, [x17, #0x1f0]\n" "add x13, x13, #0x10\n" "sub x14, x14, #0x8\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "add x17, x17, #0x200\n" "28:" // Height 1: Multiply loop: Main loop skip "cbz x14, 30f\n" "29:" // Height 1: Multiply loop: Odd block loop "ldr h0, [x13], #0x2\n" "sub x14, x14, #0x1\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q16, [x17, #0x0]\n" + "fmla v8.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x17, #0x10]\n" + "fmla v9.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x17, #0x20]\n" + "fmla v10.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" "add x17, x17, #0x40\n" "cbnz x14, 29b\n" "30:" // Height 1: Multiply loop: No odd multiplies @@ -491,17 +491,17 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "prfm pstl1keep, [x16, #0x0]\n" "tbz %x[flags], #1, 31f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v0.8h\n" - "fmin v9.8h, v9.8h, v0.8h\n" - "fmin v10.8h, v10.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v16.8h\n" + "fmin v9.8h, v9.8h, v16.8h\n" + "fmin v10.8h, v10.8h, v16.8h\n" + "fmin v11.8h, v11.8h, v16.8h\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - 
"fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" "31:" // Height 1: No activation "cmp x8, #0x20\n" "bge 48f\n" @@ -799,324 +799,324 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "72:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" "cbnz x15, 74f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #1\n" - "add x9, x9, x20, LSL #1\n" + "add x12, x12, x20, LSL #1\n" "b 74f\n" "73:" // Height 2: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #1\n" + "add x12, x13, x21, LSL #1\n" "74:" // Height 2: input setup done "cmp x14, #0x8\n" "blt 77f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x10\n" - "ldr q1, [x9, #0x0]\n" + "ldr q1, [x12, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 76f\n" "75:" // Height 2: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d17, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr x12, [x17, #0x48]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x58]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x98]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xd8]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - 
"ldr d7, [x17, #0xf0]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr d6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr x12, [x17, #0x108]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr d7, [x17, #0x110]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x118]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr x12, [x17, #0x128]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr d6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr x11, [x17, #0x138]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr d7, [x17, #0x130]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr d6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr x12, [x17, #0x148]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr d7, [x17, #0x150]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x158]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr x12, [x17, #0x168]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr d6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr x11, [x17, #0x178]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr d7, [x17, #0x170]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr d6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr x12, [x17, #0x188]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr d7, [x17, #0x190]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x198]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr x12, [x17, #0x1a8]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr d6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr x11, [x17, #0x1b8]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr d7, [x17, #0x1b0]\n" - "mov v6.d[1], x12\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "mov v7.d[1], x11\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr d6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr x12, [x17, #0x1c8]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr d7, [x17, #0x1d0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x1d8]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr x12, [x17, #0x1e8]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr d6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr x11, [x17, #0x1f8]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr d7, [x17, #0x1f0]\n" - "mov v6.d[1], x12\n" + "ldr d16, [x17, #0x30]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[0]\n" + "ldr d17, [x17, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr x20, [x17, #0x48]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr d16, [x17, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x58]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr d17, [x17, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr d16, [x17, #0x70]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr d17, [x17, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr x20, [x17, #0x88]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr d16, [x17, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x98]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr d17, [x17, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr x20, 
[x17, #0xb8]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr d16, [x17, #0xb0]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr d17, [x17, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr x20, [x17, #0xc8]\n" + "fmla v15.8h, v16.8h, v1.h[2]\n" + "ldr d16, [x17, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xd8]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr d17, [x17, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr d16, [x17, #0xf0]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr d17, [x17, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr x20, [x17, #0x108]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr d16, [x17, #0x110]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x118]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr x21, [x17, #0x128]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr d17, [x17, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr x20, [x17, #0x138]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr d16, [x17, #0x130]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr d17, [x17, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr x20, [x17, #0x148]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr d16, [x17, #0x150]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x158]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr x21, [x17, #0x168]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr d17, [x17, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr x20, [x17, #0x178]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr d16, [x17, #0x170]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr d17, [x17, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr x20, [x17, #0x188]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr d16, [x17, #0x190]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x198]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr x21, [x17, #0x1a8]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr d17, [x17, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr x20, [x17, #0x1b8]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr d16, [x17, #0x1b0]\n" + "mov v17.d[1], x21\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "mov v16.d[1], x20\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr d17, [x17, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr x20, [x17, #0x1c8]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr d16, [x17, #0x1d0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x1d8]\n" + "mov v16.d[1], x20\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr x21, [x17, #0x1e8]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr d17, [x17, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr x20, [x17, #0x1f8]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr d16, [x17, #0x1f0]\n" + "mov v17.d[1], x21\n" "add x13, x13, #0x10\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" + "mov v16.d[1], x20\n" + "add x12, x12, #0x10\n" "add x17, x17, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" "ldr d6, [x17, #0x0]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v11.8h, v16.8h, 
v0.h[7]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr d1, [x9, #0x0]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" + "ldr d1, [x12, #0x0]\n" "sub x14, x14, #0x8\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x10\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x28, [x9, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x17, #0x18]\n" - "mov v1.d[1], x28\n" + "ldr x20, [x13, #0x8]\n" + "mov v6.d[1], x21\n" + "ldr x21, [x12, #0x8]\n" + "mov v0.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v1.d[1], x21\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v7.d[1], x11\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v7.d[1], x20\n" + "prfm pldl1keep, [x12, #0x80]\n" "bge 75b\n" "76:" // Height 2: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "add x13, x13, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q17, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" "sub x14, x14, #0x8\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v14.8h, v17.8h, v1.h[0]\n" + "ldr q17, [x17, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x17, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x17, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x17, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x17, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x17, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x17, #0x1b0]\n" - 
"fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x17, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x17, #0x1f0]\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr q16, [x17, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x17, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x17, #0x70]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x17, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x17, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x17, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x17, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x17, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "fmla v15.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x17, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x17, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x17, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x17, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x17, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x17, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x17, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x17, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x17, #0x150]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x17, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x17, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x17, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x17, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x17, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x17, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x17, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x17, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr q17, [x17, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr q16, [x17, #0x1f0]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" "add x17, x17, #0x200\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" "77:" // Height 2: Multiply loop: Main loop skip "cbz x14, 79f\n" "78:" // Height 2: Multiply loop: Odd block loop - "ldr h0, [x13], #0x2\n" + "ldr h1, [x13], #0x2\n" "sub x14, x14, #0x1\n" - "ldr h1, [x9], #0x2\n" - "ldr q6, 
[x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr h0, [x12], #0x2\n" + "ldr q17, [x17, #0x0]\n" + "fmla v8.8h, v17.8h, v1.h[0]\n" + "ldr q16, [x17, #0x10]\n" + "fmla v12.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x17, #0x20]\n" + "fmla v9.8h, v16.8h, v1.h[0]\n" + "fmla v13.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.8h, v17.8h, v1.h[0]\n" "add x17, x17, #0x40\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v14.8h, v17.8h, v0.h[0]\n" + "fmla v11.8h, v16.8h, v1.h[0]\n" + "fmla v15.8h, v16.8h, v0.h[0]\n" "cbnz x14, 78b\n" "79:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1129,25 +1129,25 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "prfm pstl1keep, [x25, #0x0]\n" "tbz %x[flags], #1, 80f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v0.8h\n" - "fmin v9.8h, v9.8h, v0.8h\n" - "fmin v10.8h, v10.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v0.8h\n" - "fmin v12.8h, v12.8h, v0.8h\n" - "fmin v13.8h, v13.8h, v0.8h\n" - "fmin v14.8h, v14.8h, v0.8h\n" - "fmin v15.8h, v15.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v16.8h\n" + "fmin v9.8h, v9.8h, v16.8h\n" + "fmin v10.8h, v10.8h, v16.8h\n" + "fmin v11.8h, v11.8h, v16.8h\n" + "fmin v12.8h, v12.8h, v16.8h\n" + "fmin v13.8h, v13.8h, v16.8h\n" + "fmin v14.8h, v14.8h, v16.8h\n" + "fmin v15.8h, v15.8h, v16.8h\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" + "fmax v12.8h, v12.8h, v16.8h\n" + "fmax v13.8h, v13.8h, v16.8h\n" + "fmax v14.8h, v14.8h, v16.8h\n" + "fmax v15.8h, v15.8h, v16.8h\n" "80:" // Height 2: No activation "cmp x8, #0x20\n" "bge 97f\n" @@ -1526,404 +1526,404 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "121:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 122f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" "cbnz x15, 123f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #1\n" - "add x9, x9, x20, LSL #1\n" - "add x27, x27, x20, LSL #1\n" + "add x12, x12, x20, LSL #1\n" + "add x11, x11, x20, LSL #1\n" "b 123f\n" "122:" // Height 3: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #1\n" - "add x27, x9, x20, LSL #1\n" + "add x12, x13, x21, LSL #1\n" + "add x11, x12, x21, LSL #1\n" "123:" // Height 3: input setup done "cmp x14, #0x8\n" "blt 126f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x10\n" - 
"ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 125f\n" "124:" // Height 3: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d21, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" + "mov v21.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x98]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x108]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0x118]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr d6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x128]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr d7, [x17, #0x110]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x138]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr d6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x148]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - 
"ldr d7, [x17, #0x130]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x158]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr d6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x168]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr d7, [x17, #0x150]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x178]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr d6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x188]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr d7, [x17, #0x170]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x198]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr d6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x1a8]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr d7, [x17, #0x190]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1b8]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr d6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1c8]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr d7, [x17, #0x1b0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1d8]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr d6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1e8]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr d7, [x17, #0x1d0]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x1f8]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr d6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr d20, [x17, #0x30]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr d21, [x17, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr d20, [x17, #0x50]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr d21, [x17, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr d20, [x17, #0x70]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr d21, [x17, #0x80]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr d20, [x17, #0x90]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr d21, [x17, #0xa0]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[2]\n" + "ldr x21, [x17, #0xc8]\n" 
+ "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr d20, [x17, #0xb0]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr d21, [x17, #0xc0]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr d20, [x17, #0xd0]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr d21, [x17, #0xe0]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "ldr x21, [x17, #0x108]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr d20, [x17, #0xf0]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "ldr x20, [x17, #0x118]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr d21, [x17, #0x100]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "ldr x21, [x17, #0x128]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr d20, [x17, #0x110]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "ldr x20, [x17, #0x138]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr d21, [x17, #0x120]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "ldr x21, [x17, #0x148]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr d20, [x17, #0x130]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "ldr x20, [x17, #0x158]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr d21, [x17, #0x140]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "ldr x21, [x17, #0x168]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr d20, [x17, #0x150]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "ldr x20, [x17, #0x178]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr d21, [x17, #0x160]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "ldr x21, [x17, #0x188]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr d20, [x17, #0x170]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "ldr x20, [x17, #0x198]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr d21, [x17, #0x180]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "ldr x21, [x17, #0x1a8]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr d20, [x17, #0x190]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1b8]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr d21, [x17, #0x1a0]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1c8]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr d20, [x17, #0x1b0]\n" + "mov v20.d[1], x20\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1d8]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr d21, [x17, #0x1c0]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" + "mov v21.d[1], x21\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1e8]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr d20, [x17, #0x1d0]\n" + "mov v20.d[1], x20\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "ldr x20, [x17, #0x1f8]\n" + "fmla v16.8h, 
v21.8h, v2.h[7]\n" + "ldr d21, [x17, #0x1e0]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" + "mov v21.d[1], x21\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" "add x13, x13, #0x10\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr d7, [x17, #0x1f0]\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr d20, [x17, #0x1f0]\n" + "mov v20.d[1], x20\n" + "add x12, x12, #0x10\n" + "add x11, x11, #0x10\n" "add x17, x17, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" + "ldr x20, [x17, #0x8]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "ldr x23, [x13, #0x8]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" "ldr d6, [x17, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr d1, [x9, #0x0]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr d2, [x27, #0x0]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" + "ldr d1, [x12, #0x0]\n" + "ldr x22, [x12, #0x8]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" + "ldr d2, [x11, #0x0]\n" "sub x14, x14, #0x8\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x10\n" - "ldr x26, [x27, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x18]\n" - "mov v0.d[1], x10\n" + "ldr x21, [x11, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v0.d[1], x23\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v1.d[1], x28\n" - "prfm pldl1keep, [x9, #0x80]\n" - "mov v2.d[1], x26\n" - "prfm pldl1keep, [x27, #0x80]\n" - "mov v7.d[1], x11\n" + "mov v1.d[1], x22\n" + "prfm pldl1keep, [x12, #0x80]\n" + "mov v2.d[1], x21\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v7.d[1], x20\n" "bge 124b\n" "125:" // Height 3: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "add x13, x13, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q21, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "sub x14, x14, #0x8\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q20, [x17, #0x30]\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, 
v2.h[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x17, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x17, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x17, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x17, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x17, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x17, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x17, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x17, #0x1f0]\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr q21, [x17, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr q20, [x17, #0x50]\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[1]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x17, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x17, #0x70]\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x17, #0x80]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x17, #0x90]\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x17, #0xa0]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "fmla v13.8h, v20.8h, 
v1.h[2]\n" + "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x17, #0xb0]\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x17, #0xc0]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x17, #0xd0]\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x17, #0xe0]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x17, #0xf0]\n" + "fmla v10.8h, v21.8h, v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x17, #0x100]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x17, #0x110]\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x17, #0x120]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x17, #0x130]\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x17, #0x140]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x17, #0x150]\n" + "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x17, #0x160]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x17, #0x170]\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x17, #0x180]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x17, #0x190]\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x17, #0x1a0]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x17, #0x1b0]\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x17, #0x1c0]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x17, #0x1d0]\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "fmla v16.8h, v21.8h, v2.h[7]\n" + "ldr q21, [x17, #0x1e0]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr q20, [x17, #0x1f0]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" "add x17, x17, #0x200\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" "126:" // Height 3: Multiply loop: Main loop skip "cbz x14, 128f\n" "127:" // Height 3: Multiply loop: Odd block loop - "ldr h0, [x13], #0x2\n" + "ldr h2, [x13], #0x2\n" "sub x14, x14, #0x1\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, 
[x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr h1, [x12], #0x2\n" + "ldr h0, [x11], #0x2\n" + "ldr q21, [x17, #0x0]\n" + "fmla v8.8h, v21.8h, v2.h[0]\n" + "ldr q20, [x17, #0x10]\n" + "fmla v12.8h, v21.8h, v1.h[0]\n" + "fmla v16.8h, v21.8h, v0.h[0]\n" + "ldr q21, [x17, #0x20]\n" + "fmla v9.8h, v20.8h, v2.h[0]\n" + "fmla v13.8h, v20.8h, v1.h[0]\n" + "fmla v17.8h, v20.8h, v0.h[0]\n" + "ldr q20, [x17, #0x30]\n" + "fmla v10.8h, v21.8h, v2.h[0]\n" "add x17, x17, #0x40\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" + "fmla v18.8h, v21.8h, v0.h[0]\n" + "fmla v11.8h, v20.8h, v2.h[0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v0.h[0]\n" "cbnz x14, 127b\n" "128:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1938,33 +1938,33 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "prfm pstl1keep, [x24, #0x0]\n" "tbz %x[flags], #1, 129f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v0.8h\n" - "fmin v9.8h, v9.8h, v0.8h\n" - "fmin v10.8h, v10.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v0.8h\n" - "fmin v12.8h, v12.8h, v0.8h\n" - "fmin v13.8h, v13.8h, v0.8h\n" - "fmin v14.8h, v14.8h, v0.8h\n" - "fmin v15.8h, v15.8h, v0.8h\n" - "fmin v16.8h, v16.8h, v0.8h\n" - "fmin v17.8h, v17.8h, v0.8h\n" - "fmin v18.8h, v18.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v0.8h\n" + "ld1r { v20.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v20.8h\n" + "fmin v9.8h, v9.8h, v20.8h\n" + "fmin v10.8h, v10.8h, v20.8h\n" + "fmin v11.8h, v11.8h, v20.8h\n" + "fmin v12.8h, v12.8h, v20.8h\n" + "fmin v13.8h, v13.8h, v20.8h\n" + "fmin v14.8h, v14.8h, v20.8h\n" + "fmin v15.8h, v15.8h, v20.8h\n" + "fmin v16.8h, v16.8h, v20.8h\n" + "fmin v17.8h, v17.8h, v20.8h\n" + "fmin v18.8h, v18.8h, v20.8h\n" + "fmin v19.8h, v19.8h, v20.8h\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" + "ld1r { v20.8h }, [x20]\n" + "fmax v8.8h, v8.8h, v20.8h\n" + "fmax v9.8h, v9.8h, v20.8h\n" + "fmax v10.8h, v10.8h, v20.8h\n" + "fmax v11.8h, v11.8h, v20.8h\n" + "fmax v12.8h, v12.8h, v20.8h\n" + "fmax v13.8h, v13.8h, v20.8h\n" + "fmax v14.8h, v14.8h, v20.8h\n" + "fmax v15.8h, v15.8h, v20.8h\n" + "fmax v16.8h, v16.8h, v20.8h\n" + "fmax v17.8h, v17.8h, v20.8h\n" + "fmax v18.8h, v18.8h, v20.8h\n" + "fmax v19.8h, v19.8h, v20.8h\n" "129:" // Height 3: No activation "cmp x8, #0x20\n" "bge 146f\n" @@ -2424,484 +2424,484 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "170:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 171f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, 
#0x18]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" "cbnz x15, 172f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #1\n" - "add x9, x9, x20, LSL #1\n" - "add x27, x27, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" + "add x12, x12, x20, LSL #1\n" + "add x11, x11, x20, LSL #1\n" + "add x10, x10, x20, LSL #1\n" "b 172f\n" "171:" // Height 4: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #1\n" - "add x27, x9, x20, LSL #1\n" - "add x25, x27, x20, LSL #1\n" + "add x12, x13, x21, LSL #1\n" + "add x11, x12, x21, LSL #1\n" + "add x10, x11, x21, LSL #1\n" "172:" // Height 4: input setup done "cmp x14, #0x8\n" "blt 175f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x10\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 174f\n" "173:" // Height 4: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "add x13, x13, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d25, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" + "mov v25.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "add x27, x27, #0x10\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "add x25, x25, #0x10\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x98]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr x26, [x27, #0x8]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr x24, [x25, #0x8]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr d24, [x17, #0x30]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.8h, v25.8h, v2.h[0]\n" + "add 
x11, x11, #0x10\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr d25, [x17, #0x40]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "add x10, x10, #0x10\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr d24, [x17, #0x50]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "ldr x25, [x13, #0x8]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr d25, [x17, #0x60]\n" + "fmla v9.8h, v24.8h, v0.h[1]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "ldr x24, [x12, #0x8]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr d24, [x17, #0x70]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "ldr x23, [x11, #0x8]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr d25, [x17, #0x80]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "ldr x22, [x10, #0x8]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr d24, [x17, #0x90]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" "sub x14, x14, #0x8\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr d25, [x17, #0xa0]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "ldr x21, [x17, #0xc8]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" "cmp x14, #0x10\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr d24, [x17, #0xb0]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x108]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0x118]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr d6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x128]\n" - 
"fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr d7, [x17, #0x110]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x138]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr d6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x148]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr d7, [x17, #0x130]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x158]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr d6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x168]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr d7, [x17, #0x150]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x178]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr d6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x188]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr d7, [x17, #0x170]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x198]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr d6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x1a8]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr d7, [x17, #0x190]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1b8]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr d6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1c8]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr d7, [x17, #0x1b0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1d8]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr d6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1e8]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr d7, [x17, #0x1d0]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x1f8]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr d6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr d7, [x17, #0x1f0]\n" - "mov v7.d[1], x11\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr d25, [x17, #0xc0]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr d24, [x17, #0xd0]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla 
v16.8h, v25.8h, v2.h[3]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr d25, [x17, #0xe0]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "ldr x21, [x17, #0x108]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr d24, [x17, #0xf0]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "ldr x20, [x17, #0x118]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr d25, [x17, #0x100]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "ldr x21, [x17, #0x128]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr d24, [x17, #0x110]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "ldr x20, [x17, #0x138]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr d25, [x17, #0x120]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "ldr x21, [x17, #0x148]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr d24, [x17, #0x130]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "ldr x20, [x17, #0x158]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + "fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr d25, [x17, #0x140]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "ldr x21, [x17, #0x168]\n" + "fmla v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr d24, [x17, #0x150]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "ldr x20, [x17, #0x178]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr d25, [x17, #0x160]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "ldr x21, [x17, #0x188]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr d24, [x17, #0x170]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "ldr x20, [x17, #0x198]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr d25, [x17, #0x180]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "ldr x21, [x17, #0x1a8]\n" + "fmla v19.8h, v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr d24, [x17, #0x190]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1b8]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr d25, [x17, #0x1a0]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1c8]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr d24, [x17, #0x1b0]\n" + "mov v24.d[1], x20\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla v14.8h, v25.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1d8]\n" + "fmla v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr d25, [x17, #0x1c0]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "mov v25.d[1], x21\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1e8]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr d24, [x17, #0x1d0]\n" + "mov v24.d[1], x20\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla 
v12.8h, v25.8h, v1.h[7]\n" + "ldr x20, [x17, #0x1f8]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr d25, [x17, #0x1e0]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "mov v25.d[1], x21\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr d24, [x17, #0x1f0]\n" + "mov v24.d[1], x20\n" "add x17, x17, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x18]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "ldr x20, [x17, #0x18]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" "ldr d6, [x17, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr d1, [x9, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr d2, [x27, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "ldr d3, [x25, #0x0]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" + "ldr d1, [x12, #0x0]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" + "ldr d2, [x11, #0x0]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" + "ldr d3, [x10, #0x0]\n" "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v7.d[1], x11\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x24\n" + "mov v2.d[1], x23\n" + "mov v3.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 173b\n" "174:" // Height 4: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "add x13, x13, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q25, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "sub x14, x14, #0x8\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x13, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla 
v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x17, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x17, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x17, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x17, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x17, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x17, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x17, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x17, #0x1f0]\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" + "ldr q24, [x17, #0x30]\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v18.8h, v25.8h, 
v2.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr q25, [x17, #0x40]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr q24, [x17, #0x50]\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x17, #0x60]\n" + "fmla v9.8h, v24.8h, v0.h[1]\n" + "fmla v13.8h, v24.8h, v1.h[1]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x17, #0x70]\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x17, #0x80]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x17, #0x90]\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x17, #0xa0]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x17, #0xb0]\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x17, #0xc0]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x17, #0xd0]\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "fmla v16.8h, v25.8h, v2.h[3]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x17, #0xe0]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x17, #0xf0]\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x17, #0x100]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x17, #0x110]\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x17, #0x120]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x17, #0x130]\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + "fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x17, #0x140]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "fmla v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x17, #0x150]\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x17, #0x160]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x17, #0x170]\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x17, #0x180]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "fmla v19.8h, 
v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x17, #0x190]\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x17, #0x1a0]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x17, #0x1b0]\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla v14.8h, v25.8h, v1.h[6]\n" + "fmla v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x17, #0x1c0]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x17, #0x1d0]\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla v12.8h, v25.8h, v1.h[7]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr q25, [x17, #0x1e0]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr q24, [x17, #0x1f0]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" "add x17, x17, #0x200\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" "175:" // Height 4: Multiply loop: Main loop skip "cbz x14, 177f\n" "176:" // Height 4: Multiply loop: Odd block loop - "ldr h0, [x13], #0x2\n" + "ldr h3, [x13], #0x2\n" "sub x14, x14, #0x1\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr h2, [x12], #0x2\n" + "ldr h1, [x11], #0x2\n" + "ldr h0, [x10], #0x2\n" + "ldr q25, [x17, #0x0]\n" + "fmla v8.8h, v25.8h, v3.h[0]\n" + "ldr q24, [x17, #0x10]\n" + "fmla v12.8h, v25.8h, v2.h[0]\n" + "fmla v16.8h, v25.8h, v1.h[0]\n" + "fmla v20.8h, v25.8h, v0.h[0]\n" + "ldr q25, [x17, #0x20]\n" + "fmla v9.8h, v24.8h, v3.h[0]\n" + "fmla v13.8h, v24.8h, v2.h[0]\n" + "fmla v17.8h, v24.8h, v1.h[0]\n" + "fmla v21.8h, v24.8h, v0.h[0]\n" + "ldr q24, [x17, #0x30]\n" + "fmla v10.8h, v25.8h, v3.h[0]\n" "add x17, x17, #0x40\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v14.8h, v25.8h, v2.h[0]\n" + "fmla v18.8h, v25.8h, v1.h[0]\n" + "fmla v22.8h, v25.8h, v0.h[0]\n" + "fmla v11.8h, v24.8h, v3.h[0]\n" + "fmla v15.8h, v24.8h, v2.h[0]\n" + "fmla v19.8h, v24.8h, v1.h[0]\n" + "fmla v23.8h, v24.8h, v0.h[0]\n" "cbnz x14, 176b\n" "177:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2918,41 +2918,41 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz 
%x[flags], #1, 178f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v0.8h\n" - "fmin v9.8h, v9.8h, v0.8h\n" - "fmin v10.8h, v10.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v0.8h\n" - "fmin v12.8h, v12.8h, v0.8h\n" - "fmin v13.8h, v13.8h, v0.8h\n" - "fmin v14.8h, v14.8h, v0.8h\n" - "fmin v15.8h, v15.8h, v0.8h\n" - "fmin v16.8h, v16.8h, v0.8h\n" - "fmin v17.8h, v17.8h, v0.8h\n" - "fmin v18.8h, v18.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v0.8h\n" - "fmin v21.8h, v21.8h, v0.8h\n" - "fmin v22.8h, v22.8h, v0.8h\n" - "fmin v23.8h, v23.8h, v0.8h\n" + "ld1r { v24.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v24.8h\n" + "fmin v9.8h, v9.8h, v24.8h\n" + "fmin v10.8h, v10.8h, v24.8h\n" + "fmin v11.8h, v11.8h, v24.8h\n" + "fmin v12.8h, v12.8h, v24.8h\n" + "fmin v13.8h, v13.8h, v24.8h\n" + "fmin v14.8h, v14.8h, v24.8h\n" + "fmin v15.8h, v15.8h, v24.8h\n" + "fmin v16.8h, v16.8h, v24.8h\n" + "fmin v17.8h, v17.8h, v24.8h\n" + "fmin v18.8h, v18.8h, v24.8h\n" + "fmin v19.8h, v19.8h, v24.8h\n" + "fmin v20.8h, v20.8h, v24.8h\n" + "fmin v21.8h, v21.8h, v24.8h\n" + "fmin v22.8h, v22.8h, v24.8h\n" + "fmin v23.8h, v23.8h, v24.8h\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" + "ld1r { v24.8h }, [x20]\n" + "fmax v8.8h, v8.8h, v24.8h\n" + "fmax v9.8h, v9.8h, v24.8h\n" + "fmax v10.8h, v10.8h, v24.8h\n" + "fmax v11.8h, v11.8h, v24.8h\n" + "fmax v12.8h, v12.8h, v24.8h\n" + "fmax v13.8h, v13.8h, v24.8h\n" + "fmax v14.8h, v14.8h, v24.8h\n" + "fmax v15.8h, v15.8h, v24.8h\n" + "fmax v16.8h, v16.8h, v24.8h\n" + "fmax v17.8h, v17.8h, v24.8h\n" + "fmax v18.8h, v18.8h, v24.8h\n" + "fmax v19.8h, v19.8h, v24.8h\n" + "fmax v20.8h, v20.8h, v24.8h\n" + "fmax v21.8h, v21.8h, v24.8h\n" + "fmax v22.8h, v22.8h, v24.8h\n" + "fmax v23.8h, v23.8h, v24.8h\n" "178:" // Height 4: No activation "cmp x8, #0x20\n" "bge 195f\n" @@ -3382,675 +3382,675 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "ld1 { v25.h }[2], [x22]\n" "b 215f\n" "210:" // Height 5: Partial accumulate: partial_1_8 - "mov x20, #0x10\n" - "tbz x8, #0, 215f\n" - "ldr h9, [x16, #0x0]\n" - "ldr h13, [x25, #0x0]\n" - "ldr h17, [x24, #0x0]\n" - "ldr h21, [x23, #0x0]\n" - "ldr h25, [x22, #0x0]\n" - "b 215f\n" - "211:" // Height 5: Partial accumulate: partial_4_0 - "tbz x8, #2, 213f\n" - "ldr d8, [x16], #0x8\n" - "ldr d12, [x25], #0x8\n" - "ldr d16, [x24], #0x8\n" - "ldr d20, [x23], #0x8\n" - "ldr d24, [x22], #0x8\n" - "tbz x8, #1, 212f\n" - "ld1 { v8.s }[2], [x16], #0x4\n" - "mov x20, #0xc\n" - "ld1 { v12.s }[2], [x25], #0x4\n" - "ld1 { v16.s }[2], [x24], #0x4\n" - "ld1 { v20.s }[2], [x23], #0x4\n" - "ld1 { v24.s }[2], [x22], #0x4\n" - "tbz x8, #0, 215f\n" - "ld1 { v8.h }[6], [x16]\n" - "ld1 { v12.h }[6], [x25]\n" - "ld1 { v16.h }[6], [x24]\n" - "ld1 { v20.h }[6], [x23]\n" - "ld1 { v24.h }[6], [x22]\n" - "b 215f\n" - "212:" // Height 5: Partial accumulate: partial_1_4 - "mov x20, #0x8\n" - "tbz x8, #0, 215f\n" - "ld1 { v8.h }[4], [x16]\n" - "ld1 { v12.h }[4], [x25]\n" - "ld1 { v16.h }[4], 
[x24]\n" - "ld1 { v20.h }[4], [x23]\n" - "ld1 { v24.h }[4], [x22]\n" - "b 215f\n" - "213:" // Height 5: Partial accumulate: partial_2_0 - "tbz x8, #1, 214f\n" - "ldr s8, [x16], #0x4\n" - "mov x20, #0x4\n" - "ldr s12, [x25], #0x4\n" - "ldr s16, [x24], #0x4\n" - "ldr s20, [x23], #0x4\n" - "ldr s24, [x22], #0x4\n" - "tbz x8, #0, 215f\n" - "ld1 { v8.h }[2], [x16]\n" - "ld1 { v12.h }[2], [x25]\n" - "ld1 { v16.h }[2], [x24]\n" - "ld1 { v20.h }[2], [x23]\n" - "ld1 { v24.h }[2], [x22]\n" - "b 215f\n" - "214:" // Height 5: Partial accumulate: partial_1_0 - "ldr h8, [x16, #0x0]\n" - "mov x20, #0x0\n" - "ldr h12, [x25, #0x0]\n" - "ldr h16, [x24, #0x0]\n" - "ldr h20, [x23, #0x0]\n" - "ldr h24, [x22, #0x0]\n" - "215:" // Height 5: Partial accumulate: Done - "sub x16, x16, x20\n" - "b 218f\n" - "216:" // Height 5: full accumulate - "ldr q8, [x16, #0x0]\n" - "ldr q9, [x16, #0x10]\n" - "ldr q10, [x16, #0x20]\n" - "ldr q11, [x16, #0x30]\n" - "ldr q12, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "ldr q14, [x25, #0x20]\n" - "ldr q15, [x25, #0x30]\n" - "ldr q16, [x24, #0x0]\n" - "ldr q17, [x24, #0x10]\n" - "ldr q18, [x24, #0x20]\n" - "ldr q19, [x24, #0x30]\n" - "ldr q20, [x23, #0x0]\n" - "ldr q21, [x23, #0x10]\n" - "ldr q22, [x23, #0x20]\n" - "ldr q23, [x23, #0x30]\n" - "ldr q24, [x22, #0x0]\n" - "ldr q25, [x22, #0x10]\n" - "ldr q26, [x22, #0x20]\n" - "ldr q27, [x22, #0x30]\n" - "b 218f\n" - "217:" // Height 5: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "218:" // Height 5: setup done - "mov x15, #0x0\n" - "219:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 220f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "cbnz x15, 221f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x13, x13, x20, LSL #1\n" - "add x9, x9, x20, LSL #1\n" - "add x27, x27, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "b 221f\n" - "220:" // Height 5: setup direct input - "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #1\n" - "add x27, x9, x20, LSL #1\n" - "add x25, x27, x20, LSL #1\n" - "add x23, x25, x20, LSL #1\n" - "221:" // Height 5: input setup done - "cmp x14, #0x8\n" - "blt 224f\n" - "ldr q0, [x13, #0x0]\n" - "cmp x14, #0x10\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q6, [x17, #0x0]\n" - "ldr q7, [x17, #0x10]\n" - "blt 223f\n" - "222:" // Height 5: Multiply loop: Main loop head - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr x12, [x17, #0x28]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x38]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x13, x13, #0x10\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x9, x9, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr d6, [x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], 
x12\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x48]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x27, x27, #0x10\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "add x25, x25, #0x10\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "add x23, x23, #0x10\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr x26, [x27, #0x8]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr x24, [x25, #0x8]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr x22, [x23, #0x8]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "sub x14, x14, #0x8\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "cmp x14, #0x10\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x98]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "prfm pldl1keep, [x13, #0x80]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x108]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla 
v25.8h, v7.8h, v4.h[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0x118]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr d6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x128]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr d7, [x17, #0x110]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x138]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr d6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x148]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr d7, [x17, #0x130]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x158]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr d6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x168]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr d7, [x17, #0x150]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x178]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr d6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x188]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr d7, [x17, #0x170]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x198]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr d6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x1a8]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr d7, [x17, #0x190]\n" - "mov v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1b8]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr d6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1c8]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr d7, [x17, #0x1b0]\n" - "mov v7.d[1], x11\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1d8]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr d6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1e8]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr d7, [x17, #0x1d0]\n" - "mov 
v7.d[1], x11\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x1f8]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr d6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "mov v6.d[1], x12\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr d7, [x17, #0x1f0]\n" - "mov v7.d[1], x11\n" - "add x17, x17, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x18]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "ldr d6, [x17, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "ldr d0, [x13, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr d1, [x9, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr d2, [x27, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "ldr d3, [x25, #0x0]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "ldr d4, [x23, #0x0]\n" - "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v4.d[1], x22\n" - "mov v7.d[1], x11\n" - "bge 222b\n" - "223:" // Height 5: Multiply loop: Single iteration only - "fmla v8.8h, v6.8h, v0.h[0]\n" - "add x13, x13, #0x10\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x9, x9, #0x10\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x27, x27, #0x10\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x25, x25, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x23, x23, #0x10\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "sub x14, x14, #0x8\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "prfm pldl1keep, [x13, #0x80]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" 
- "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x17, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x17, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x17, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x17, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x17, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x17, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x17, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x17, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x17, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x17, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x17, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x17, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x17, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - 
"fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x17, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x17, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x17, #0x1f0]\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "add x17, x17, #0x200\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "224:" // Height 5: Multiply loop: Main loop skip - "cbz x14, 226f\n" - "225:" // Height 5: Multiply loop: Odd block loop - "ldr h0, [x13], #0x2\n" - "sub x14, x14, #0x1\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" + "mov x20, #0x10\n" + "tbz x8, #0, 215f\n" + "ldr h9, [x16, #0x0]\n" + "ldr h13, [x25, #0x0]\n" + "ldr h17, [x24, #0x0]\n" + "ldr h21, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "b 215f\n" + "211:" // Height 5: Partial accumulate: partial_4_0 + "tbz x8, #2, 213f\n" + "ldr d8, [x16], #0x8\n" + "ldr d12, [x25], #0x8\n" + "ldr d16, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "tbz x8, #1, 212f\n" + "ld1 { v8.s }[2], [x16], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[6], [x16]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "b 215f\n" + "212:" // Height 5: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[4], [x16]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "b 215f\n" + "213:" // Height 5: Partial accumulate: partial_2_0 + "tbz x8, #1, 214f\n" + "ldr s8, [x16], #0x4\n" + "mov x20, #0x4\n" + "ldr s12, [x25], #0x4\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "tbz x8, #0, 215f\n" + "ld1 { v8.h }[2], [x16]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "b 215f\n" + "214:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x16, #0x0]\n" + "mov x20, #0x0\n" + "ldr h12, [x25, #0x0]\n" + "ldr h16, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "215:" // Height 5: Partial accumulate: Done + "sub x16, x16, x20\n" + "b 218f\n" + "216:" // Height 5: full accumulate + "ldr q8, [x16, #0x0]\n" + "ldr q9, [x16, #0x10]\n" + "ldr q10, [x16, #0x20]\n" + "ldr q11, [x16, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, #0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "b 218f\n" + "217:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, 
#0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "218:" // Height 5: setup done + "mov x15, #0x0\n" + "219:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w14, [x20, x15, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 220f\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "cbnz x15, 221f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x13, x13, x20, LSL #1\n" + "add x12, x12, x20, LSL #1\n" + "add x11, x11, x20, LSL #1\n" + "add x10, x10, x20, LSL #1\n" + "add x9, x9, x20, LSL #1\n" + "b 221f\n" + "220:" // Height 5: setup direct input + "mov x13, %x[input_ptr]\n" + "add x12, x13, x21, LSL #1\n" + "add x11, x12, x21, LSL #1\n" + "add x10, x11, x21, LSL #1\n" + "add x9, x10, x21, LSL #1\n" + "221:" // Height 5: input setup done + "cmp x14, #0x8\n" + "blt 224f\n" + "ldr q0, [x13, #0x0]\n" + "cmp x14, #0x10\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "ldr q6, [x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" "ldr q7, [x17, #0x10]\n" + "blt 223f\n" + "222:" // Height 5: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x13, x13, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x12, x12, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr d29, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" + "mov v29.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" + "add x11, x11, #0x10\n" "fmla v21.8h, v7.8h, v3.h[0]\n" + "add x10, x10, #0x10\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr d28, [x17, #0x30]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "add x9, x9, #0x10\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "ldr x26, [x13, #0x8]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr d29, [x17, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "ldr x25, [x12, #0x8]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "ldr x24, [x11, #0x8]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr d28, [x17, #0x50]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "ldr x23, [x10, #0x8]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "ldr x22, [x9, #0x8]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr d29, [x17, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "sub x14, x14, #0x8\n" + "fmla 
v21.8h, v28.8h, v3.h[1]\n" + "cmp x14, #0x10\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr d28, [x17, #0x70]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, v29.8h, v1.h[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.8h, v29.8h, v2.h[1]\n" + "prfm pldl1keep, [x13, #0x80]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr d29, [x17, #0x80]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr d28, [x17, #0x90]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr d29, [x17, #0xa0]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "ldr x21, [x17, #0xc8]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" + "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr d28, [x17, #0xb0]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr d29, [x17, #0xc0]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr d28, [x17, #0xd0]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr d29, [x17, #0xe0]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "ldr x21, [x17, #0x108]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr d28, [x17, #0xf0]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "ldr x20, [x17, #0x118]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr d29, [x17, #0x100]\n" + "fmla v11.8h, v28.8h, v0.h[3]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[3]\n" + "ldr x21, [x17, #0x128]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr d28, [x17, #0x110]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "ldr x20, [x17, #0x138]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr d29, [x17, #0x120]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "ldr x21, [x17, #0x148]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr d28, [x17, #0x130]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "ldr x20, [x17, #0x158]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla 
v26.8h, v29.8h, v4.h[4]\n" + "ldr d29, [x17, #0x140]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "ldr x21, [x17, #0x168]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr d28, [x17, #0x150]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "ldr x20, [x17, #0x178]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr d29, [x17, #0x160]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "ldr x21, [x17, #0x188]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr d28, [x17, #0x170]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "ldr x20, [x17, #0x198]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr d29, [x17, #0x180]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "ldr x21, [x17, #0x1a8]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla v27.8h, v28.8h, v4.h[5]\n" + "ldr d28, [x17, #0x190]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1b8]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr d29, [x17, #0x1a0]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1c8]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr d28, [x17, #0x1b0]\n" + "mov v28.d[1], x20\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "ldr x20, [x17, #0x1d8]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr d29, [x17, #0x1c0]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" + "mov v29.d[1], x21\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "ldr x21, [x17, #0x1e8]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr d28, [x17, #0x1d0]\n" + "mov v28.d[1], x20\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "ldr x20, [x17, #0x1f8]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr d29, [x17, #0x1e0]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" + "mov v29.d[1], x21\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr d28, [x17, #0x1f0]\n" + "mov v28.d[1], x20\n" + "add x17, x17, #0x200\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "ldr x20, [x17, #0x18]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" + "ldr d6, [x17, #0x0]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "ldr d0, [x13, #0x0]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "ldr d1, [x12, #0x0]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "ldr d2, [x11, #0x0]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "ldr d3, [x10, #0x0]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "ldr d4, [x9, #0x0]\n" + "ldr d7, [x17, #0x10]\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x26\n" + "mov v1.d[1], x25\n" 
+ "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x22\n" + "mov v7.d[1], x20\n" + "bge 222b\n" + "223:" // Height 5: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "add x13, x13, #0x10\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x12, x12, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x11, x11, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x10, x10, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q29, [x17, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x9, x9, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "sub x14, x14, #0x8\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x13, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q28, [x17, #0x30]\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr q29, [x17, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr q28, [x17, #0x50]\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x17, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "fmla v21.8h, v28.8h, v3.h[1]\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x17, #0x70]\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, v29.8h, v1.h[1]\n" + "fmla v18.8h, v29.8h, v2.h[1]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x17, #0x80]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x17, #0x90]\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x17, #0xa0]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" + "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x17, #0xb0]\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x17, #0xc0]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x17, #0xd0]\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x17, #0xe0]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x17, #0xf0]\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x17, #0x100]\n" + "fmla v11.8h, v28.8h, 
v0.h[3]\n" + "fmla v15.8h, v28.8h, v1.h[3]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x17, #0x110]\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x17, #0x120]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x17, #0x130]\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla v26.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x17, #0x140]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x17, #0x150]\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x17, #0x160]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x17, #0x170]\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x17, #0x180]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla v27.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x17, #0x190]\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x17, #0x1a0]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x17, #0x1b0]\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x17, #0x1c0]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x17, #0x1d0]\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr q29, [x17, #0x1e0]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr q28, [x17, #0x1f0]\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "add x17, x17, #0x200\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "224:" // Height 5: Multiply loop: Main loop skip + "cbz x14, 226f\n" + "225:" // Height 5: Multiply loop: Odd block loop + "ldr h4, [x13], #0x2\n" 
+ "sub x14, x14, #0x1\n" + "ldr h3, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h1, [x10], #0x2\n" + "ldr h0, [x9], #0x2\n" + "ldr q29, [x17, #0x0]\n" + "fmla v8.8h, v29.8h, v4.h[0]\n" + "ldr q28, [x17, #0x10]\n" + "fmla v12.8h, v29.8h, v3.h[0]\n" + "fmla v16.8h, v29.8h, v2.h[0]\n" + "fmla v20.8h, v29.8h, v1.h[0]\n" + "fmla v24.8h, v29.8h, v0.h[0]\n" + "ldr q29, [x17, #0x20]\n" + "fmla v9.8h, v28.8h, v4.h[0]\n" + "fmla v13.8h, v28.8h, v3.h[0]\n" + "fmla v17.8h, v28.8h, v2.h[0]\n" + "fmla v21.8h, v28.8h, v1.h[0]\n" + "fmla v25.8h, v28.8h, v0.h[0]\n" + "ldr q28, [x17, #0x30]\n" + "fmla v10.8h, v29.8h, v4.h[0]\n" "add x17, x17, #0x40\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v14.8h, v29.8h, v3.h[0]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v1.h[0]\n" + "fmla v26.8h, v29.8h, v0.h[0]\n" + "fmla v11.8h, v28.8h, v4.h[0]\n" + "fmla v15.8h, v28.8h, v3.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v1.h[0]\n" + "fmla v27.8h, v28.8h, v0.h[0]\n" "cbnz x14, 225b\n" "226:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -4069,49 +4069,49 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 227f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v0.8h\n" - "fmin v9.8h, v9.8h, v0.8h\n" - "fmin v10.8h, v10.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v0.8h\n" - "fmin v12.8h, v12.8h, v0.8h\n" - "fmin v13.8h, v13.8h, v0.8h\n" - "fmin v14.8h, v14.8h, v0.8h\n" - "fmin v15.8h, v15.8h, v0.8h\n" - "fmin v16.8h, v16.8h, v0.8h\n" - "fmin v17.8h, v17.8h, v0.8h\n" - "fmin v18.8h, v18.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v0.8h\n" - "fmin v21.8h, v21.8h, v0.8h\n" - "fmin v22.8h, v22.8h, v0.8h\n" - "fmin v23.8h, v23.8h, v0.8h\n" - "fmin v24.8h, v24.8h, v0.8h\n" - "fmin v25.8h, v25.8h, v0.8h\n" - "fmin v26.8h, v26.8h, v0.8h\n" - "fmin v27.8h, v27.8h, v0.8h\n" + "ld1r { v28.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v28.8h\n" + "fmin v9.8h, v9.8h, v28.8h\n" + "fmin v10.8h, v10.8h, v28.8h\n" + "fmin v11.8h, v11.8h, v28.8h\n" + "fmin v12.8h, v12.8h, v28.8h\n" + "fmin v13.8h, v13.8h, v28.8h\n" + "fmin v14.8h, v14.8h, v28.8h\n" + "fmin v15.8h, v15.8h, v28.8h\n" + "fmin v16.8h, v16.8h, v28.8h\n" + "fmin v17.8h, v17.8h, v28.8h\n" + "fmin v18.8h, v18.8h, v28.8h\n" + "fmin v19.8h, v19.8h, v28.8h\n" + "fmin v20.8h, v20.8h, v28.8h\n" + "fmin v21.8h, v21.8h, v28.8h\n" + "fmin v22.8h, v22.8h, v28.8h\n" + "fmin v23.8h, v23.8h, v28.8h\n" + "fmin v24.8h, v24.8h, v28.8h\n" + "fmin v25.8h, v25.8h, v28.8h\n" + "fmin v26.8h, v26.8h, v28.8h\n" + "fmin v27.8h, v27.8h, v28.8h\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, 
v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmax v27.8h, v27.8h, v0.8h\n" + "ld1r { v28.8h }, [x20]\n" + "fmax v8.8h, v8.8h, v28.8h\n" + "fmax v9.8h, v9.8h, v28.8h\n" + "fmax v10.8h, v10.8h, v28.8h\n" + "fmax v11.8h, v11.8h, v28.8h\n" + "fmax v12.8h, v12.8h, v28.8h\n" + "fmax v13.8h, v13.8h, v28.8h\n" + "fmax v14.8h, v14.8h, v28.8h\n" + "fmax v15.8h, v15.8h, v28.8h\n" + "fmax v16.8h, v16.8h, v28.8h\n" + "fmax v17.8h, v17.8h, v28.8h\n" + "fmax v18.8h, v18.8h, v28.8h\n" + "fmax v19.8h, v19.8h, v28.8h\n" + "fmax v20.8h, v20.8h, v28.8h\n" + "fmax v21.8h, v21.8h, v28.8h\n" + "fmax v22.8h, v22.8h, v28.8h\n" + "fmax v23.8h, v23.8h, v28.8h\n" + "fmax v24.8h, v24.8h, v28.8h\n" + "fmax v25.8h, v25.8h, v28.8h\n" + "fmax v26.8h, v26.8h, v28.8h\n" + "fmax v27.8h, v27.8h, v28.8h\n" "227:" // Height 5: No activation "cmp x8, #0x20\n" "bge 244f\n" @@ -4736,98 +4736,98 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "268:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 269f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "ldr x28, [x20, #0x28]\n" "cbnz x15, 270f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #1\n" + "add x12, x12, x20, LSL #1\n" + "add x11, x11, x20, LSL #1\n" + "add x10, x10, x20, LSL #1\n" "add x9, x9, x20, LSL #1\n" - "add x27, x27, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "add x21, x21, x20, LSL #1\n" + "add x28, x28, x20, LSL #1\n" "b 270f\n" "269:" // Height 6: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #1\n" - "add x27, x9, x20, LSL #1\n" - "add x25, x27, x20, LSL #1\n" - "add x23, x25, x20, LSL #1\n" - "add x21, x23, x20, LSL #1\n" + "add x12, x13, x21, LSL #1\n" + "add x11, x12, x21, LSL #1\n" + "add x10, x11, x21, LSL #1\n" + "add x9, x10, x21, LSL #1\n" + "add x28, x9, x21, LSL #1\n" "270:" // Height 6: input setup done "cmp x14, #0x8\n" "blt 273f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x10\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x21, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x28, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 272f\n" "271:" // Height 6: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "add x13, x13, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v28.8h, v6.8h, v5.h[0]\n" "ldr d6, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x48]\n" + 
"ldr x21, [x17, #0x48]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v29.8h, v7.8h, v5.h[0]\n" "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[0]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr x11, [x17, #0x58]\n" + "ldr x20, [x17, #0x58]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr x10, [x13, #0x8]\n" + "ldr x27, [x13, #0x8]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x26, [x12, #0x8]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr x26, [x27, #0x8]\n" + "ldr x25, [x11, #0x8]\n" "fmla v30.8h, v6.8h, v5.h[0]\n" "ldr d6, [x17, #0x40]\n" "fmla v11.8h, v7.8h, v0.h[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr x12, [x17, #0x68]\n" + "ldr x21, [x17, #0x68]\n" "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x24, [x10, #0x8]\n" "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x23, [x9, #0x8]\n" "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr x20, [x21, #0x8]\n" + "ldr x22, [x28, #0x8]\n" "fmla v31.8h, v7.8h, v5.h[0]\n" "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[1]\n" "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x78]\n" + "ldr x20, [x17, #0x78]\n" "fmla v16.8h, v6.8h, v2.h[1]\n" "sub x14, x14, #0x8\n" "fmla v20.8h, v6.8h, v3.h[1]\n" @@ -4837,240 +4837,240 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "fmla v28.8h, v6.8h, v5.h[1]\n" "ldr d6, [x17, #0x60]\n" "fmla v9.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0x88]\n" + "ldr x21, [x17, #0x88]\n" "fmla v17.8h, v7.8h, v2.h[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v25.8h, v7.8h, v4.h[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v29.8h, v7.8h, v5.h[1]\n" "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[1]\n" "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr x11, [x17, #0x98]\n" + "ldr x20, [x17, #0x98]\n" "fmla v18.8h, v6.8h, v2.h[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v22.8h, v6.8h, v3.h[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v26.8h, v6.8h, v4.h[1]\n" "fmla v30.8h, v6.8h, v5.h[1]\n" "ldr d6, [x17, #0x80]\n" "fmla v11.8h, v7.8h, v0.h[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr x12, [x17, #0xa8]\n" + "ldr x21, [x17, #0xa8]\n" "fmla v19.8h, v7.8h, v2.h[1]\n" "fmla v23.8h, v7.8h, v3.h[1]\n" "fmla v27.8h, v7.8h, v4.h[1]\n" "fmla v31.8h, v7.8h, v5.h[1]\n" "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[2]\n" "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xb8]\n" + "ldr x20, [x17, #0xb8]\n" "fmla v16.8h, v6.8h, v2.h[2]\n" "fmla v20.8h, v6.8h, v3.h[2]\n" "fmla v24.8h, v6.8h, v4.h[2]\n" "fmla v28.8h, v6.8h, v5.h[2]\n" "ldr d6, [x17, #0xa0]\n" "fmla v9.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xc8]\n" + "ldr x21, [x17, #0xc8]\n" "fmla v17.8h, v7.8h, v2.h[2]\n" "fmla v21.8h, v7.8h, v3.h[2]\n" "fmla v25.8h, v7.8h, v4.h[2]\n" "fmla 
v29.8h, v7.8h, v5.h[2]\n" "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[2]\n" "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr x11, [x17, #0xd8]\n" + "ldr x20, [x17, #0xd8]\n" "fmla v18.8h, v6.8h, v2.h[2]\n" "fmla v22.8h, v6.8h, v3.h[2]\n" "fmla v26.8h, v6.8h, v4.h[2]\n" "fmla v30.8h, v6.8h, v5.h[2]\n" "ldr d6, [x17, #0xc0]\n" "fmla v11.8h, v7.8h, v0.h[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr x12, [x17, #0xe8]\n" + "ldr x21, [x17, #0xe8]\n" "fmla v19.8h, v7.8h, v2.h[2]\n" "fmla v23.8h, v7.8h, v3.h[2]\n" "fmla v27.8h, v7.8h, v4.h[2]\n" "fmla v31.8h, v7.8h, v5.h[2]\n" "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[3]\n" "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0xf8]\n" + "ldr x20, [x17, #0xf8]\n" "fmla v16.8h, v6.8h, v2.h[3]\n" "fmla v20.8h, v6.8h, v3.h[3]\n" "fmla v24.8h, v6.8h, v4.h[3]\n" "fmla v28.8h, v6.8h, v5.h[3]\n" "ldr d6, [x17, #0xe0]\n" "fmla v9.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x108]\n" + "ldr x21, [x17, #0x108]\n" "fmla v17.8h, v7.8h, v2.h[3]\n" "fmla v21.8h, v7.8h, v3.h[3]\n" "fmla v25.8h, v7.8h, v4.h[3]\n" "fmla v29.8h, v7.8h, v5.h[3]\n" "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[3]\n" "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr x11, [x17, #0x118]\n" + "ldr x20, [x17, #0x118]\n" "fmla v18.8h, v6.8h, v2.h[3]\n" "fmla v22.8h, v6.8h, v3.h[3]\n" "fmla v26.8h, v6.8h, v4.h[3]\n" "fmla v30.8h, v6.8h, v5.h[3]\n" "ldr d6, [x17, #0x100]\n" "fmla v11.8h, v7.8h, v0.h[3]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr x12, [x17, #0x128]\n" + "ldr x21, [x17, #0x128]\n" "fmla v19.8h, v7.8h, v2.h[3]\n" "fmla v23.8h, v7.8h, v3.h[3]\n" "fmla v27.8h, v7.8h, v4.h[3]\n" "fmla v31.8h, v7.8h, v5.h[3]\n" "ldr d7, [x17, #0x110]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[4]\n" "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x138]\n" + "ldr x20, [x17, #0x138]\n" "fmla v16.8h, v6.8h, v2.h[4]\n" "fmla v20.8h, v6.8h, v3.h[4]\n" "fmla v24.8h, v6.8h, v4.h[4]\n" "fmla v28.8h, v6.8h, v5.h[4]\n" "ldr d6, [x17, #0x120]\n" "fmla v9.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x148]\n" + "ldr x21, [x17, #0x148]\n" "fmla v17.8h, v7.8h, v2.h[4]\n" "fmla v21.8h, v7.8h, v3.h[4]\n" "fmla v25.8h, v7.8h, v4.h[4]\n" "fmla v29.8h, v7.8h, v5.h[4]\n" "ldr d7, [x17, #0x130]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[4]\n" "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr x11, [x17, #0x158]\n" + "ldr x20, [x17, #0x158]\n" "fmla v18.8h, v6.8h, v2.h[4]\n" "fmla v22.8h, v6.8h, v3.h[4]\n" "fmla v26.8h, v6.8h, v4.h[4]\n" "fmla v30.8h, v6.8h, v5.h[4]\n" "ldr d6, [x17, #0x140]\n" "fmla v11.8h, v7.8h, v0.h[4]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr x12, [x17, #0x168]\n" + "ldr x21, [x17, #0x168]\n" "fmla v19.8h, v7.8h, v2.h[4]\n" "fmla v23.8h, v7.8h, v3.h[4]\n" "fmla v27.8h, v7.8h, v4.h[4]\n" "fmla v31.8h, v7.8h, v5.h[4]\n" "ldr d7, [x17, #0x150]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[5]\n" "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x178]\n" + "ldr x20, [x17, #0x178]\n" "fmla v16.8h, v6.8h, v2.h[5]\n" "fmla v20.8h, v6.8h, v3.h[5]\n" "fmla v24.8h, v6.8h, v4.h[5]\n" "fmla v28.8h, v6.8h, v5.h[5]\n" 
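On the "fmin"/"fmax" ladders that precede each "No activation" label in these hunks: the clamp bounds are broadcast once with "ld1r" from minval/maxval in KernelArgs, then min-then-max is applied to every accumulator register, fusing a bounded (ReLU-style) activation into the GEMM epilogue. A minimal per-register sketch (illustrative; assumes ARMv8.2-A FP16 vector arithmetic is available):

#include <arm_neon.h>

// Illustrative equivalent of:  fmin v8.8h, v8.8h, vMax
//                              fmax v8.8h, v8.8h, vMin
static inline float16x8_t clamp_f16x8(float16x8_t acc,
                                      float16x8_t vmin, float16x8_t vmax)
{
    acc = vminq_f16(acc, vmax);
    return vmaxq_f16(acc, vmin);
}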
"ldr d6, [x17, #0x160]\n" "fmla v9.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x188]\n" + "ldr x21, [x17, #0x188]\n" "fmla v17.8h, v7.8h, v2.h[5]\n" "fmla v21.8h, v7.8h, v3.h[5]\n" "fmla v25.8h, v7.8h, v4.h[5]\n" "fmla v29.8h, v7.8h, v5.h[5]\n" "ldr d7, [x17, #0x170]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[5]\n" "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr x11, [x17, #0x198]\n" + "ldr x20, [x17, #0x198]\n" "fmla v18.8h, v6.8h, v2.h[5]\n" "fmla v22.8h, v6.8h, v3.h[5]\n" "fmla v26.8h, v6.8h, v4.h[5]\n" "fmla v30.8h, v6.8h, v5.h[5]\n" "ldr d6, [x17, #0x180]\n" "fmla v11.8h, v7.8h, v0.h[5]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr x12, [x17, #0x1a8]\n" + "ldr x21, [x17, #0x1a8]\n" "fmla v19.8h, v7.8h, v2.h[5]\n" "fmla v23.8h, v7.8h, v3.h[5]\n" "fmla v27.8h, v7.8h, v4.h[5]\n" "fmla v31.8h, v7.8h, v5.h[5]\n" "ldr d7, [x17, #0x190]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[6]\n" "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1b8]\n" + "ldr x20, [x17, #0x1b8]\n" "fmla v16.8h, v6.8h, v2.h[6]\n" "fmla v20.8h, v6.8h, v3.h[6]\n" "fmla v24.8h, v6.8h, v4.h[6]\n" "fmla v28.8h, v6.8h, v5.h[6]\n" "ldr d6, [x17, #0x1a0]\n" "fmla v9.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1c8]\n" + "ldr x21, [x17, #0x1c8]\n" "fmla v17.8h, v7.8h, v2.h[6]\n" "fmla v21.8h, v7.8h, v3.h[6]\n" "fmla v25.8h, v7.8h, v4.h[6]\n" "fmla v29.8h, v7.8h, v5.h[6]\n" "ldr d7, [x17, #0x1b0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.8h, v6.8h, v0.h[6]\n" "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr x11, [x17, #0x1d8]\n" + "ldr x20, [x17, #0x1d8]\n" "fmla v18.8h, v6.8h, v2.h[6]\n" "fmla v22.8h, v6.8h, v3.h[6]\n" "fmla v26.8h, v6.8h, v4.h[6]\n" "fmla v30.8h, v6.8h, v5.h[6]\n" "ldr d6, [x17, #0x1c0]\n" "fmla v11.8h, v7.8h, v0.h[6]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr x12, [x17, #0x1e8]\n" + "ldr x21, [x17, #0x1e8]\n" "fmla v19.8h, v7.8h, v2.h[6]\n" "fmla v23.8h, v7.8h, v3.h[6]\n" "fmla v27.8h, v7.8h, v4.h[6]\n" "fmla v31.8h, v7.8h, v5.h[6]\n" "ldr d7, [x17, #0x1d0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.8h, v6.8h, v0.h[7]\n" "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x1f8]\n" + "ldr x20, [x17, #0x1f8]\n" "fmla v16.8h, v6.8h, v2.h[7]\n" "fmla v20.8h, v6.8h, v3.h[7]\n" "fmla v24.8h, v6.8h, v4.h[7]\n" "fmla v28.8h, v6.8h, v5.h[7]\n" "ldr d6, [x17, #0x1e0]\n" "fmla v9.8h, v7.8h, v0.h[7]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.8h, v7.8h, v1.h[7]\n" "fmla v17.8h, v7.8h, v2.h[7]\n" "fmla v21.8h, v7.8h, v3.h[7]\n" "fmla v25.8h, v7.8h, v4.h[7]\n" "fmla v29.8h, v7.8h, v5.h[7]\n" "ldr d7, [x17, #0x1f0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "add x17, x17, #0x200\n" "fmla v10.8h, v6.8h, v0.h[7]\n" - "ldr x12, [x17, #0x8]\n" + "ldr x21, [x17, #0x8]\n" "fmla v14.8h, v6.8h, v1.h[7]\n" - "ldr x11, [x17, #0x18]\n" + "ldr x20, [x17, #0x18]\n" "fmla v18.8h, v6.8h, v2.h[7]\n" "fmla v22.8h, v6.8h, v3.h[7]\n" "fmla v26.8h, v6.8h, v4.h[7]\n" @@ -5079,56 +5079,56 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "fmla v11.8h, v7.8h, v0.h[7]\n" "ldr d0, [x13, #0x0]\n" "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x12, #0x0]\n" "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x11, #0x0]\n" "fmla v23.8h, v7.8h, v3.h[7]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d3, 
[x10, #0x0]\n" "fmla v27.8h, v7.8h, v4.h[7]\n" - "ldr d4, [x23, #0x0]\n" + "ldr d4, [x9, #0x0]\n" "fmla v31.8h, v7.8h, v5.h[7]\n" - "ldr d5, [x21, #0x0]\n" + "ldr d5, [x28, #0x0]\n" "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x26\n" + "mov v2.d[1], x25\n" "mov v3.d[1], x24\n" - "mov v4.d[1], x22\n" - "mov v5.d[1], x20\n" - "mov v7.d[1], x11\n" + "mov v4.d[1], x23\n" + "mov v5.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 271b\n" "272:" // Height 6: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "add x13, x13, #0x10\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v28.8h, v6.8h, v5.h[0]\n" "ldr q6, [x17, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "sub x14, x14, #0x8\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "prfm pldl1keep, [x13, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v29.8h, v7.8h, v5.h[0]\n" "ldr q7, [x17, #0x30]\n" "fmla v10.8h, v6.8h, v0.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v18.8h, v6.8h, v2.h[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v22.8h, v6.8h, v3.h[0]\n" "fmla v26.8h, v6.8h, v4.h[0]\n" "fmla v30.8h, v6.8h, v5.h[0]\n" @@ -5338,42 +5338,42 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "273:" // Height 6: Multiply loop: Main loop skip "cbz x14, 275f\n" "274:" // Height 6: Multiply loop: Odd block loop - "ldr h0, [x13], #0x2\n" + "ldr h7, [x13], #0x2\n" "sub x14, x14, #0x1\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" - "ldr h5, [x21], #0x2\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "fmla v28.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr h6, [x12], #0x2\n" + "ldr h5, [x11], #0x2\n" + "ldr h4, [x10], #0x2\n" + "ldr h3, [x9], #0x2\n" + "ldr h2, [x28], #0x2\n" + "ldr q1, [x17, #0x0]\n" + "fmla v8.8h, v1.8h, v7.h[0]\n" + "ldr q0, [x17, #0x10]\n" + "fmla v12.8h, v1.8h, v6.h[0]\n" + "fmla v16.8h, v1.8h, v5.h[0]\n" + "fmla v20.8h, v1.8h, v4.h[0]\n" + "fmla v24.8h, v1.8h, v3.h[0]\n" + "fmla v28.8h, v1.8h, v2.h[0]\n" + "ldr q1, [x17, #0x20]\n" + "fmla v9.8h, v0.8h, v7.h[0]\n" + "fmla v13.8h, v0.8h, v6.h[0]\n" + "fmla v17.8h, v0.8h, v5.h[0]\n" + "fmla v21.8h, v0.8h, v4.h[0]\n" + "fmla v25.8h, v0.8h, v3.h[0]\n" + "fmla v29.8h, v0.8h, v2.h[0]\n" + "ldr q0, [x17, #0x30]\n" + "fmla v10.8h, v1.8h, v7.h[0]\n" "add x17, x17, 
#0x40\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v30.8h, v6.8h, v5.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "fmla v31.8h, v7.8h, v5.h[0]\n" + "fmla v14.8h, v1.8h, v6.h[0]\n" + "fmla v18.8h, v1.8h, v5.h[0]\n" + "fmla v22.8h, v1.8h, v4.h[0]\n" + "fmla v26.8h, v1.8h, v3.h[0]\n" + "fmla v30.8h, v1.8h, v2.h[0]\n" + "fmla v11.8h, v0.8h, v7.h[0]\n" + "fmla v15.8h, v0.8h, v6.h[0]\n" + "fmla v19.8h, v0.8h, v5.h[0]\n" + "fmla v23.8h, v0.8h, v4.h[0]\n" + "fmla v27.8h, v0.8h, v3.h[0]\n" + "fmla v31.8h, v0.8h, v2.h[0]\n" "cbnz x14, 274b\n" "275:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -5743,7 +5743,6 @@ void a64_hybrid_fp16_mla_6x32_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "296:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp index 335308751f..8e5f600c83 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp @@ -244,11 +244,11 @@ void a64_hybrid_fp16_mla_6x32 ( "23:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 24f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -265,69 +265,69 @@ void a64_hybrid_fp16_mla_6x32 ( "blt 27f\n" "26:" // Height 1: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x10, 
#0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x10, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x10, #0x150]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x10, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x10, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x10, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x10, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr q17, [x10, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr q16, [x10, #0x1f0]\n" "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x10\n" "add x10, x10, #0x200\n" @@ -337,84 +337,84 @@ void 
a64_hybrid_fp16_mla_6x32 ( "bge 26b\n" "27:" // Height 1: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x50]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "ldr q17, [x10, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "ldr q16, [x10, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x10, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "ldr q17, [x10, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "ldr q16, [x10, #0x150]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x10, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x10, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x10, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "ldr q17, [x10, #0x1c0]\n" + 
"fmla v11.8h, v16.8h, v0.h[6]\n" + "ldr q16, [x10, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "ldr q17, [x10, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "ldr q16, [x10, #0x1f0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x8\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x200\n" "28:" // Height 1: Multiply loop: Main loop skip "cbz x27, 30f\n" "29:" // Height 1: Multiply loop: Odd block loop "ldr h0, [x26], #0x2\n" - "ldr q6, [x10, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q16, [x10, #0x0]\n" + "fmla v8.8h, v16.8h, v0.h[0]\n" "sub x27, x27, #0x1\n" - "ldr q7, [x10, #0x10]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q17, [x10, #0x10]\n" + "ldr q16, [x10, #0x20]\n" + "fmla v9.8h, v17.8h, v0.h[0]\n" + "fmla v10.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" "add x10, x10, #0x40\n" "cbnz x27, 29b\n" "30:" // Height 1: Multiply loop: No odd multiplies @@ -425,17 +425,17 @@ void a64_hybrid_fp16_mla_6x32 ( "prfm pstl1keep, [x9, #0x0]\n" "tbz %x[flags], #1, 31f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v17.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" "31:" // Height 1: No activation "cmp x11, #0x20\n" "bge 48f\n" @@ -733,12 +733,12 @@ void a64_hybrid_fp16_mla_6x32 ( "72:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 74f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -746,7 +746,7 @@ void a64_hybrid_fp16_mla_6x32 ( "b 74f\n" "73:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "74:" // Height 2: input setup done "cmp x27, #0x8\n" "blt 77f\n" @@ -759,230 +759,230 @@ void a64_hybrid_fp16_mla_6x32 ( "75:" // Height 2: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "sub x27, x27, #0x8\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x26, x26, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "fmla v14.8h, v17.8h, 
v1.h[0]\n" + "ldr q17, [x10, #0x40]\n" "add x25, x25, #0x10\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr q16, [x10, #0x50]\n" "cmp x27, #0x10\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x70]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "fmla v15.8h, v16.8h, 
v1.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x10, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x10, #0x150]\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x10, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x10, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x10, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x10, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr q17, [x10, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr q16, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 75b\n" "76:" // Height 2: Multiply loop: Single iteration only "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "add x26, x26, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x25, x25, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v10.8h, v17.8h, v0.h[0]\n" + "fmla v14.8h, v17.8h, v1.h[0]\n" + "ldr q17, [x10, #0x40]\n" "sub x27, x27, #0x8\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v11.8h, v16.8h, v0.h[0]\n" + "fmla v15.8h, v16.8h, v1.h[0]\n" + "ldr q16, [x10, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x60]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "fmla v12.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla 
v12.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v9.8h, v16.8h, v0.h[1]\n" + "fmla v13.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.8h, v17.8h, v0.h[1]\n" + "fmla v14.8h, v17.8h, v1.h[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.8h, v16.8h, v0.h[1]\n" + "fmla v15.8h, v16.8h, v1.h[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.8h, v17.8h, v0.h[2]\n" + "fmla v12.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.8h, v16.8h, v0.h[2]\n" + "fmla v13.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.8h, v17.8h, v0.h[2]\n" + "fmla v14.8h, v17.8h, v1.h[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.8h, v16.8h, v0.h[2]\n" + "fmla v15.8h, v16.8h, v1.h[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "fmla v12.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.8h, v16.8h, v0.h[3]\n" + "fmla v13.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0xf0]\n" + "fmla v10.8h, v17.8h, v0.h[3]\n" + "fmla v14.8h, v17.8h, v1.h[3]\n" + "ldr q17, [x10, #0x100]\n" + "fmla v11.8h, v16.8h, v0.h[3]\n" + "fmla v15.8h, v16.8h, v1.h[3]\n" + "ldr q16, [x10, #0x110]\n" + "fmla v8.8h, v17.8h, v0.h[4]\n" + "fmla v12.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x120]\n" + "fmla v9.8h, v16.8h, v0.h[4]\n" + "fmla v13.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x10, #0x130]\n" + "fmla v10.8h, v17.8h, v0.h[4]\n" + "fmla v14.8h, v17.8h, v1.h[4]\n" + "ldr q17, [x10, #0x140]\n" + "fmla v11.8h, v16.8h, v0.h[4]\n" + "fmla v15.8h, v16.8h, v1.h[4]\n" + "ldr q16, [x10, #0x150]\n" + "fmla v8.8h, v17.8h, 
v0.h[5]\n" + "fmla v12.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x160]\n" + "fmla v9.8h, v16.8h, v0.h[5]\n" + "fmla v13.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x10, #0x170]\n" + "fmla v10.8h, v17.8h, v0.h[5]\n" + "fmla v14.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0x180]\n" + "fmla v11.8h, v16.8h, v0.h[5]\n" + "fmla v15.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x10, #0x190]\n" + "fmla v8.8h, v17.8h, v0.h[6]\n" + "fmla v12.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x1a0]\n" + "fmla v9.8h, v16.8h, v0.h[6]\n" + "fmla v13.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x10, #0x1b0]\n" + "fmla v10.8h, v17.8h, v0.h[6]\n" + "fmla v14.8h, v17.8h, v1.h[6]\n" + "ldr q17, [x10, #0x1c0]\n" + "fmla v11.8h, v16.8h, v0.h[6]\n" + "fmla v15.8h, v16.8h, v1.h[6]\n" + "ldr q16, [x10, #0x1d0]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v12.8h, v17.8h, v1.h[7]\n" + "ldr q17, [x10, #0x1e0]\n" + "fmla v9.8h, v16.8h, v0.h[7]\n" + "fmla v13.8h, v16.8h, v1.h[7]\n" + "ldr q16, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v0.h[7]\n" + "fmla v14.8h, v17.8h, v1.h[7]\n" + "fmla v11.8h, v16.8h, v0.h[7]\n" + "fmla v15.8h, v16.8h, v1.h[7]\n" "77:" // Height 2: Multiply loop: Main loop skip "cbz x27, 79f\n" "78:" // Height 2: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + "fmla v8.8h, v17.8h, v1.h[0]\n" + "fmla v12.8h, v17.8h, v0.h[0]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.8h, v16.8h, v1.h[0]\n" + "fmla v13.8h, v16.8h, v0.h[0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.8h, v17.8h, v1.h[0]\n" + "fmla v14.8h, v17.8h, v0.h[0]\n" "add x10, x10, #0x40\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v11.8h, v16.8h, v1.h[0]\n" + "fmla v15.8h, v16.8h, v0.h[0]\n" "cbnz x27, 78b\n" "79:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -995,25 +995,25 @@ void a64_hybrid_fp16_mla_6x32 ( "prfm pstl1keep, [x25, #0x0]\n" "tbz %x[flags], #1, 80f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v17.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmin v12.8h, v12.8h, v17.8h\n" + "fmin v13.8h, v13.8h, v17.8h\n" + "fmin v14.8h, v14.8h, v17.8h\n" + "fmin v15.8h, v15.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax 
v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" + "fmax v12.8h, v12.8h, v16.8h\n" + "fmax v13.8h, v13.8h, v16.8h\n" + "fmax v14.8h, v14.8h, v16.8h\n" + "fmax v15.8h, v15.8h, v16.8h\n" "80:" // Height 2: No activation "cmp x11, #0x20\n" "bge 97f\n" @@ -1392,13 +1392,13 @@ void a64_hybrid_fp16_mla_6x32 ( "121:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 122f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 123f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1407,8 +1407,8 @@ void a64_hybrid_fp16_mla_6x32 ( "b 123f\n" "122:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "123:" // Height 3: input setup done "cmp x27, #0x8\n" "blt 126f\n" @@ -1425,139 +1425,139 @@ void a64_hybrid_fp16_mla_6x32 ( "sub x27, x27, #0x8\n" "add x26, x26, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "add x25, x25, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "add x24, x24, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" "cmp x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr q20, [x10, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - 
"fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[1]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x70]\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x80]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x90]\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xa0]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "fmla v13.8h, v20.8h, v1.h[2]\n" + "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xb0]\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xc0]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xd0]\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0xe0]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0xf0]\n" + "fmla v10.8h, v21.8h, 
v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0x100]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0x110]\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x120]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x10, #0x130]\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x140]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x10, #0x150]\n" + "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x160]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x10, #0x170]\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x180]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x10, #0x190]\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x1a0]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x10, #0x1b0]\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x1c0]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x10, #0x1d0]\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "fmla v16.8h, v21.8h, v2.h[7]\n" + "ldr q21, [x10, #0x1e0]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr q20, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 124b\n" @@ -1567,159 +1567,159 @@ void a64_hybrid_fp16_mla_6x32 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "add x24, x24, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "sub x27, x27, #0x8\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v0.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v18.8h, v21.8h, v2.h[0]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v11.8h, v20.8h, v0.h[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, 
v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v2.h[0]\n" + "ldr q20, [x10, #0x50]\n" + "fmla v8.8h, v21.8h, v0.h[1]\n" + "fmla v12.8h, 
v21.8h, v1.h[1]\n" + "fmla v16.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.8h, v20.8h, v0.h[1]\n" + "fmla v13.8h, v20.8h, v1.h[1]\n" + "fmla v17.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x70]\n" + "fmla v10.8h, v21.8h, v0.h[1]\n" + "fmla v14.8h, v21.8h, v1.h[1]\n" + "fmla v18.8h, v21.8h, v2.h[1]\n" + "ldr q21, [x10, #0x80]\n" + "fmla v11.8h, v20.8h, v0.h[1]\n" + "fmla v15.8h, v20.8h, v1.h[1]\n" + "fmla v19.8h, v20.8h, v2.h[1]\n" + "ldr q20, [x10, #0x90]\n" + "fmla v8.8h, v21.8h, v0.h[2]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v16.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xa0]\n" + "fmla v9.8h, v20.8h, v0.h[2]\n" + "fmla v13.8h, v20.8h, v1.h[2]\n" + "fmla v17.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xb0]\n" + "fmla v10.8h, v21.8h, v0.h[2]\n" + "fmla v14.8h, v21.8h, v1.h[2]\n" + "fmla v18.8h, v21.8h, v2.h[2]\n" + "ldr q21, [x10, #0xc0]\n" + "fmla v11.8h, v20.8h, v0.h[2]\n" + "fmla v15.8h, v20.8h, v1.h[2]\n" + "fmla v19.8h, v20.8h, v2.h[2]\n" + "ldr q20, [x10, #0xd0]\n" + "fmla v8.8h, v21.8h, v0.h[3]\n" + "fmla v12.8h, v21.8h, v1.h[3]\n" + "fmla v16.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0xe0]\n" + "fmla v9.8h, v20.8h, v0.h[3]\n" + "fmla v13.8h, v20.8h, v1.h[3]\n" + "fmla v17.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0xf0]\n" + "fmla v10.8h, v21.8h, v0.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[3]\n" + "fmla v18.8h, v21.8h, v2.h[3]\n" + "ldr q21, [x10, #0x100]\n" + "fmla v11.8h, v20.8h, v0.h[3]\n" + "fmla v15.8h, v20.8h, v1.h[3]\n" + "fmla v19.8h, v20.8h, v2.h[3]\n" + "ldr q20, [x10, #0x110]\n" + "fmla v8.8h, v21.8h, v0.h[4]\n" + "fmla v12.8h, v21.8h, v1.h[4]\n" + "fmla v16.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x120]\n" + "fmla v9.8h, v20.8h, v0.h[4]\n" + "fmla v13.8h, v20.8h, v1.h[4]\n" + "fmla v17.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x10, #0x130]\n" + "fmla v10.8h, v21.8h, v0.h[4]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v18.8h, v21.8h, v2.h[4]\n" + "ldr q21, [x10, #0x140]\n" + "fmla v11.8h, v20.8h, v0.h[4]\n" + "fmla v15.8h, v20.8h, v1.h[4]\n" + "fmla v19.8h, v20.8h, v2.h[4]\n" + "ldr q20, [x10, #0x150]\n" + "fmla v8.8h, v21.8h, v0.h[5]\n" + "fmla v12.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x160]\n" + "fmla v9.8h, v20.8h, v0.h[5]\n" + "fmla v13.8h, v20.8h, v1.h[5]\n" + "fmla v17.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x10, #0x170]\n" + "fmla v10.8h, v21.8h, v0.h[5]\n" + "fmla v14.8h, v21.8h, v1.h[5]\n" + "fmla v18.8h, v21.8h, v2.h[5]\n" + "ldr q21, [x10, #0x180]\n" + "fmla v11.8h, v20.8h, v0.h[5]\n" + "fmla v15.8h, v20.8h, v1.h[5]\n" + "fmla v19.8h, v20.8h, v2.h[5]\n" + "ldr q20, [x10, #0x190]\n" + "fmla v8.8h, v21.8h, v0.h[6]\n" + "fmla v12.8h, v21.8h, v1.h[6]\n" + "fmla v16.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x1a0]\n" + "fmla v9.8h, v20.8h, v0.h[6]\n" + "fmla v13.8h, v20.8h, v1.h[6]\n" + "fmla v17.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x10, #0x1b0]\n" + "fmla v10.8h, v21.8h, v0.h[6]\n" + "fmla v14.8h, v21.8h, v1.h[6]\n" + "fmla v18.8h, v21.8h, v2.h[6]\n" + "ldr q21, [x10, #0x1c0]\n" + "fmla v11.8h, v20.8h, v0.h[6]\n" + "fmla v15.8h, v20.8h, v1.h[6]\n" + "fmla v19.8h, v20.8h, v2.h[6]\n" + "ldr q20, [x10, #0x1d0]\n" + "fmla v8.8h, v21.8h, v0.h[7]\n" + "fmla v12.8h, v21.8h, v1.h[7]\n" + "fmla v16.8h, v21.8h, v2.h[7]\n" + "ldr q21, [x10, #0x1e0]\n" + "fmla v9.8h, v20.8h, v0.h[7]\n" + "fmla v13.8h, v20.8h, v1.h[7]\n" + "fmla v17.8h, v20.8h, v2.h[7]\n" + "ldr q20, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v11.8h, 
v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v10.8h, v21.8h, v0.h[7]\n" + "fmla v14.8h, v21.8h, v1.h[7]\n" + "fmla v18.8h, v21.8h, v2.h[7]\n" + "fmla v11.8h, v20.8h, v0.h[7]\n" + "fmla v15.8h, v20.8h, v1.h[7]\n" + "fmla v19.8h, v20.8h, v2.h[7]\n" "126:" // Height 3: Multiply loop: Main loop skip "cbz x27, 128f\n" "127:" // Height 3: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" + "ldr h2, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr q6, [x10, #0x0]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr h0, [x24], #0x2\n" + "ldr q21, [x10, #0x0]\n" + "fmla v8.8h, v21.8h, v2.h[0]\n" + "fmla v12.8h, v21.8h, v1.h[0]\n" + "ldr q20, [x10, #0x10]\n" + "fmla v16.8h, v21.8h, v0.h[0]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.8h, v20.8h, v2.h[0]\n" + "fmla v13.8h, v20.8h, v1.h[0]\n" + "fmla v17.8h, v20.8h, v0.h[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v10.8h, v21.8h, v2.h[0]\n" + "fmla v14.8h, v21.8h, v1.h[0]\n" + "fmla v18.8h, v21.8h, v0.h[0]\n" + "fmla v11.8h, v20.8h, v2.h[0]\n" + "fmla v15.8h, v20.8h, v1.h[0]\n" + "fmla v19.8h, v20.8h, v0.h[0]\n" "cbnz x27, 127b\n" "128:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1734,33 +1734,33 @@ void a64_hybrid_fp16_mla_6x32 ( "prfm pstl1keep, [x24, #0x0]\n" "tbz %x[flags], #1, 129f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v21.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" + "ld1r { v20.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v21.8h\n" + "fmin v9.8h, v9.8h, v21.8h\n" + "fmin v10.8h, v10.8h, v21.8h\n" + "fmin v11.8h, v11.8h, v21.8h\n" + "fmin v12.8h, v12.8h, v21.8h\n" + "fmin v13.8h, v13.8h, v21.8h\n" + "fmin v14.8h, v14.8h, v21.8h\n" + "fmin v15.8h, v15.8h, v21.8h\n" + "fmin v16.8h, v16.8h, v21.8h\n" + "fmin v17.8h, v17.8h, v21.8h\n" + "fmin v18.8h, v18.8h, v21.8h\n" + "fmin v19.8h, v19.8h, v21.8h\n" + "fmax v8.8h, v8.8h, v20.8h\n" + "fmax v9.8h, v9.8h, v20.8h\n" + "fmax v10.8h, v10.8h, v20.8h\n" + "fmax v11.8h, v11.8h, v20.8h\n" + "fmax v12.8h, v12.8h, v20.8h\n" + "fmax v13.8h, v13.8h, v20.8h\n" + "fmax v14.8h, v14.8h, v20.8h\n" + "fmax v15.8h, v15.8h, v20.8h\n" + "fmax v16.8h, v16.8h, 
v20.8h\n" + "fmax v17.8h, v17.8h, v20.8h\n" + "fmax v18.8h, v18.8h, v20.8h\n" + "fmax v19.8h, v19.8h, v20.8h\n" "129:" // Height 3: No activation "cmp x11, #0x20\n" "bge 146f\n" @@ -2220,14 +2220,14 @@ void a64_hybrid_fp16_mla_6x32 ( "170:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 171f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 172f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2237,9 +2237,9 @@ void a64_hybrid_fp16_mla_6x32 ( "b 172f\n" "171:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "172:" // Height 4: input setup done "cmp x27, #0x8\n" "blt 175f\n" @@ -2258,7 +2258,7 @@ void a64_hybrid_fp16_mla_6x32 ( "add x26, x26, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x25, x25, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" @@ -2266,165 +2266,165 @@ void a64_hybrid_fp16_mla_6x32 ( "add x23, x23, #0x10\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "cmp x27, #0x10\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v18.8h, v25.8h, v2.h[0]\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla 
v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr q24, [x10, #0x50]\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.8h, v24.8h, 
v0.h[1]\n" + "fmla v13.8h, v24.8h, v1.h[1]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x70]\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x80]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x90]\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xa0]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xb0]\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xc0]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xd0]\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "fmla v16.8h, v25.8h, v2.h[3]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0xe0]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0xf0]\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0x100]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0x110]\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x120]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x10, #0x130]\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + "fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x140]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "fmla v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x10, #0x150]\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x160]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x10, #0x170]\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x180]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "fmla v19.8h, v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x10, #0x190]\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x1a0]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x10, #0x1b0]\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla 
v14.8h, v25.8h, v1.h[6]\n" + "fmla v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x1c0]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x10, #0x1d0]\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla v12.8h, v25.8h, v1.h[7]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr q25, [x10, #0x1e0]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr q24, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 173b\n" @@ -2435,7 +2435,7 @@ void a64_hybrid_fp16_mla_6x32 ( "add x25, x25, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x24, x24, #0x10\n" "fmla v9.8h, v7.8h, v0.h[0]\n" "fmla v13.8h, v7.8h, v1.h[0]\n" @@ -2443,189 +2443,189 @@ void a64_hybrid_fp16_mla_6x32 ( "sub x27, x27, #0x8\n" "fmla v17.8h, v7.8h, v2.h[0]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v10.8h, v25.8h, v0.h[0]\n" + "fmla v14.8h, v25.8h, v1.h[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v18.8h, v25.8h, v2.h[0]\n" + "fmla v22.8h, v25.8h, v3.h[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - 
"fmla v22.8h, v6.8h, v3.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" + "fmla v11.8h, v24.8h, v0.h[0]\n" + "fmla v15.8h, v24.8h, v1.h[0]\n" + "fmla v19.8h, v24.8h, v2.h[0]\n" + "fmla v23.8h, v24.8h, v3.h[0]\n" + "ldr q24, [x10, #0x50]\n" + "fmla v8.8h, v25.8h, v0.h[1]\n" + "fmla v12.8h, v25.8h, v1.h[1]\n" + "fmla v16.8h, v25.8h, v2.h[1]\n" + "fmla v20.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.8h, v24.8h, v0.h[1]\n" + "fmla v13.8h, v24.8h, 
v1.h[1]\n" + "fmla v17.8h, v24.8h, v2.h[1]\n" + "fmla v21.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x70]\n" + "fmla v10.8h, v25.8h, v0.h[1]\n" + "fmla v14.8h, v25.8h, v1.h[1]\n" + "fmla v18.8h, v25.8h, v2.h[1]\n" + "fmla v22.8h, v25.8h, v3.h[1]\n" + "ldr q25, [x10, #0x80]\n" + "fmla v11.8h, v24.8h, v0.h[1]\n" + "fmla v15.8h, v24.8h, v1.h[1]\n" + "fmla v19.8h, v24.8h, v2.h[1]\n" + "fmla v23.8h, v24.8h, v3.h[1]\n" + "ldr q24, [x10, #0x90]\n" + "fmla v8.8h, v25.8h, v0.h[2]\n" + "fmla v12.8h, v25.8h, v1.h[2]\n" + "fmla v16.8h, v25.8h, v2.h[2]\n" + "fmla v20.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xa0]\n" + "fmla v9.8h, v24.8h, v0.h[2]\n" + "fmla v13.8h, v24.8h, v1.h[2]\n" + "fmla v17.8h, v24.8h, v2.h[2]\n" + "fmla v21.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xb0]\n" + "fmla v10.8h, v25.8h, v0.h[2]\n" + "fmla v14.8h, v25.8h, v1.h[2]\n" + "fmla v18.8h, v25.8h, v2.h[2]\n" + "fmla v22.8h, v25.8h, v3.h[2]\n" + "ldr q25, [x10, #0xc0]\n" + "fmla v11.8h, v24.8h, v0.h[2]\n" + "fmla v15.8h, v24.8h, v1.h[2]\n" + "fmla v19.8h, v24.8h, v2.h[2]\n" + "fmla v23.8h, v24.8h, v3.h[2]\n" + "ldr q24, [x10, #0xd0]\n" + "fmla v8.8h, v25.8h, v0.h[3]\n" + "fmla v12.8h, v25.8h, v1.h[3]\n" + "fmla v16.8h, v25.8h, v2.h[3]\n" + "fmla v20.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0xe0]\n" + "fmla v9.8h, v24.8h, v0.h[3]\n" + "fmla v13.8h, v24.8h, v1.h[3]\n" + "fmla v17.8h, v24.8h, v2.h[3]\n" + "fmla v21.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0xf0]\n" + "fmla v10.8h, v25.8h, v0.h[3]\n" + "fmla v14.8h, v25.8h, v1.h[3]\n" + "fmla v18.8h, v25.8h, v2.h[3]\n" + "fmla v22.8h, v25.8h, v3.h[3]\n" + "ldr q25, [x10, #0x100]\n" + "fmla v11.8h, v24.8h, v0.h[3]\n" + "fmla v15.8h, v24.8h, v1.h[3]\n" + "fmla v19.8h, v24.8h, v2.h[3]\n" + "fmla v23.8h, v24.8h, v3.h[3]\n" + "ldr q24, [x10, #0x110]\n" + "fmla v8.8h, v25.8h, v0.h[4]\n" + "fmla v12.8h, v25.8h, v1.h[4]\n" + "fmla v16.8h, v25.8h, v2.h[4]\n" + "fmla v20.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x120]\n" + "fmla v9.8h, v24.8h, v0.h[4]\n" + "fmla v13.8h, v24.8h, v1.h[4]\n" + "fmla v17.8h, v24.8h, v2.h[4]\n" + "fmla v21.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x10, #0x130]\n" + "fmla v10.8h, v25.8h, v0.h[4]\n" + "fmla v14.8h, v25.8h, v1.h[4]\n" + "fmla v18.8h, v25.8h, v2.h[4]\n" + "fmla v22.8h, v25.8h, v3.h[4]\n" + "ldr q25, [x10, #0x140]\n" + "fmla v11.8h, v24.8h, v0.h[4]\n" + "fmla v15.8h, v24.8h, v1.h[4]\n" + "fmla v19.8h, v24.8h, v2.h[4]\n" + "fmla v23.8h, v24.8h, v3.h[4]\n" + "ldr q24, [x10, #0x150]\n" + "fmla v8.8h, v25.8h, v0.h[5]\n" + "fmla v12.8h, v25.8h, v1.h[5]\n" + "fmla v16.8h, v25.8h, v2.h[5]\n" + "fmla v20.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x160]\n" + "fmla v9.8h, v24.8h, v0.h[5]\n" + "fmla v13.8h, v24.8h, v1.h[5]\n" + "fmla v17.8h, v24.8h, v2.h[5]\n" + "fmla v21.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x10, #0x170]\n" + "fmla v10.8h, v25.8h, v0.h[5]\n" + "fmla v14.8h, v25.8h, v1.h[5]\n" + "fmla v18.8h, v25.8h, v2.h[5]\n" + "fmla v22.8h, v25.8h, v3.h[5]\n" + "ldr q25, [x10, #0x180]\n" + "fmla v11.8h, v24.8h, v0.h[5]\n" + "fmla v15.8h, v24.8h, v1.h[5]\n" + "fmla v19.8h, v24.8h, v2.h[5]\n" + "fmla v23.8h, v24.8h, v3.h[5]\n" + "ldr q24, [x10, #0x190]\n" + "fmla v8.8h, v25.8h, v0.h[6]\n" + "fmla v12.8h, v25.8h, v1.h[6]\n" + "fmla v16.8h, v25.8h, v2.h[6]\n" + "fmla v20.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x1a0]\n" + "fmla v9.8h, v24.8h, v0.h[6]\n" + "fmla v13.8h, v24.8h, v1.h[6]\n" + "fmla v17.8h, v24.8h, v2.h[6]\n" + "fmla v21.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x10, #0x1b0]\n" + "fmla v10.8h, v25.8h, v0.h[6]\n" + "fmla v14.8h, v25.8h, v1.h[6]\n" + "fmla 
v18.8h, v25.8h, v2.h[6]\n" + "fmla v22.8h, v25.8h, v3.h[6]\n" + "ldr q25, [x10, #0x1c0]\n" + "fmla v11.8h, v24.8h, v0.h[6]\n" + "fmla v15.8h, v24.8h, v1.h[6]\n" + "fmla v19.8h, v24.8h, v2.h[6]\n" + "fmla v23.8h, v24.8h, v3.h[6]\n" + "ldr q24, [x10, #0x1d0]\n" + "fmla v8.8h, v25.8h, v0.h[7]\n" + "fmla v12.8h, v25.8h, v1.h[7]\n" + "fmla v16.8h, v25.8h, v2.h[7]\n" + "fmla v20.8h, v25.8h, v3.h[7]\n" + "ldr q25, [x10, #0x1e0]\n" + "fmla v9.8h, v24.8h, v0.h[7]\n" + "fmla v13.8h, v24.8h, v1.h[7]\n" + "fmla v17.8h, v24.8h, v2.h[7]\n" + "fmla v21.8h, v24.8h, v3.h[7]\n" + "ldr q24, [x10, #0x1f0]\n" "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v10.8h, v25.8h, v0.h[7]\n" + "fmla v14.8h, v25.8h, v1.h[7]\n" + "fmla v18.8h, v25.8h, v2.h[7]\n" + "fmla v22.8h, v25.8h, v3.h[7]\n" + "fmla v11.8h, v24.8h, v0.h[7]\n" + "fmla v15.8h, v24.8h, v1.h[7]\n" + "fmla v19.8h, v24.8h, v2.h[7]\n" + "fmla v23.8h, v24.8h, v3.h[7]\n" "175:" // Height 4: Multiply loop: Main loop skip "cbz x27, 177f\n" "176:" // Height 4: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h3, [x26], #0x2\n" + "ldr h2, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr h1, [x24], #0x2\n" + "ldr h0, [x23], #0x2\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + "fmla v8.8h, v25.8h, v3.h[0]\n" + "fmla v12.8h, v25.8h, v2.h[0]\n" + "fmla v16.8h, v25.8h, v1.h[0]\n" + "fmla v20.8h, v25.8h, v0.h[0]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.8h, v24.8h, v3.h[0]\n" + "fmla v13.8h, v24.8h, v2.h[0]\n" + "fmla v17.8h, v24.8h, v1.h[0]\n" + "fmla v21.8h, v24.8h, v0.h[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v10.8h, v25.8h, v3.h[0]\n" + "fmla v14.8h, v25.8h, v2.h[0]\n" + "fmla v18.8h, v25.8h, v1.h[0]\n" + "fmla v22.8h, v25.8h, v0.h[0]\n" + "fmla v11.8h, v24.8h, v3.h[0]\n" + "fmla v15.8h, v24.8h, v2.h[0]\n" + "fmla v19.8h, v24.8h, v1.h[0]\n" + "fmla v23.8h, v24.8h, v0.h[0]\n" "cbnz x27, 176b\n" "177:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2642,41 +2642,41 @@ void a64_hybrid_fp16_mla_6x32 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 178f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v25.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, 
v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" + "ld1r { v24.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v25.8h\n" + "fmin v9.8h, v9.8h, v25.8h\n" + "fmin v10.8h, v10.8h, v25.8h\n" + "fmin v11.8h, v11.8h, v25.8h\n" + "fmin v12.8h, v12.8h, v25.8h\n" + "fmin v13.8h, v13.8h, v25.8h\n" + "fmin v14.8h, v14.8h, v25.8h\n" + "fmin v15.8h, v15.8h, v25.8h\n" + "fmin v16.8h, v16.8h, v25.8h\n" + "fmin v17.8h, v17.8h, v25.8h\n" + "fmin v18.8h, v18.8h, v25.8h\n" + "fmin v19.8h, v19.8h, v25.8h\n" + "fmin v20.8h, v20.8h, v25.8h\n" + "fmin v21.8h, v21.8h, v25.8h\n" + "fmin v22.8h, v22.8h, v25.8h\n" + "fmin v23.8h, v23.8h, v25.8h\n" + "fmax v8.8h, v8.8h, v24.8h\n" + "fmax v9.8h, v9.8h, v24.8h\n" + "fmax v10.8h, v10.8h, v24.8h\n" + "fmax v11.8h, v11.8h, v24.8h\n" + "fmax v12.8h, v12.8h, v24.8h\n" + "fmax v13.8h, v13.8h, v24.8h\n" + "fmax v14.8h, v14.8h, v24.8h\n" + "fmax v15.8h, v15.8h, v24.8h\n" + "fmax v16.8h, v16.8h, v24.8h\n" + "fmax v17.8h, v17.8h, v24.8h\n" + "fmax v18.8h, v18.8h, v24.8h\n" + "fmax v19.8h, v19.8h, v24.8h\n" + "fmax v20.8h, v20.8h, v24.8h\n" + "fmax v21.8h, v21.8h, v24.8h\n" + "fmax v22.8h, v22.8h, v24.8h\n" + "fmax v23.8h, v23.8h, v24.8h\n" "178:" // Height 4: No activation "cmp x11, #0x20\n" "bge 195f\n" @@ -3124,583 +3124,583 @@ void a64_hybrid_fp16_mla_6x32 ( "tbz x11, #1, 212f\n" "ld1 { v8.s }[2], [x9], #0x4\n" "ld1 { v12.s }[2], [x25], #0x4\n" - "mov x20, #0xc\n" - "ld1 { v16.s }[2], [x24], #0x4\n" - "ld1 { v20.s }[2], [x23], #0x4\n" - "ld1 { v24.s }[2], [x22], #0x4\n" - "tbz x11, #0, 215f\n" - "ld1 { v8.h }[6], [x9]\n" - "ld1 { v12.h }[6], [x25]\n" - "ld1 { v16.h }[6], [x24]\n" - "ld1 { v20.h }[6], [x23]\n" - "ld1 { v24.h }[6], [x22]\n" - "b 215f\n" - "212:" // Height 5: Partial accumulate: partial_1_4 - "mov x20, #0x8\n" - "tbz x11, #0, 215f\n" - "ld1 { v8.h }[4], [x9]\n" - "ld1 { v12.h }[4], [x25]\n" - "ld1 { v16.h }[4], [x24]\n" - "ld1 { v20.h }[4], [x23]\n" - "ld1 { v24.h }[4], [x22]\n" - "b 215f\n" - "213:" // Height 5: Partial accumulate: partial_2_0 - "tbz x11, #1, 214f\n" - "ldr s8, [x9], #0x4\n" - "ldr s12, [x25], #0x4\n" - "mov x20, #0x4\n" - "ldr s16, [x24], #0x4\n" - "ldr s20, [x23], #0x4\n" - "ldr s24, [x22], #0x4\n" - "tbz x11, #0, 215f\n" - "ld1 { v8.h }[2], [x9]\n" - "ld1 { v12.h }[2], [x25]\n" - "ld1 { v16.h }[2], [x24]\n" - "ld1 { v20.h }[2], [x23]\n" - "ld1 { v24.h }[2], [x22]\n" - "b 215f\n" - "214:" // Height 5: Partial accumulate: partial_1_0 - "ldr h8, [x9, #0x0]\n" - "ldr h12, [x25, #0x0]\n" - "mov x20, #0x0\n" - "ldr h16, [x24, #0x0]\n" - "ldr h20, [x23, #0x0]\n" - "ldr h24, [x22, #0x0]\n" - "215:" // Height 5: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 218f\n" - "216:" // Height 5: full accumulate - "ldr q8, [x9, #0x0]\n" - "ldr q9, [x9, #0x10]\n" - "ldr q10, [x9, #0x20]\n" - "ldr q11, [x9, #0x30]\n" - "ldr q12, [x25, #0x0]\n" - "ldr q13, 
[x25, #0x10]\n" - "ldr q14, [x25, #0x20]\n" - "ldr q15, [x25, #0x30]\n" - "ldr q16, [x24, #0x0]\n" - "ldr q17, [x24, #0x10]\n" - "ldr q18, [x24, #0x20]\n" - "ldr q19, [x24, #0x30]\n" - "ldr q20, [x23, #0x0]\n" - "ldr q21, [x23, #0x10]\n" - "ldr q22, [x23, #0x20]\n" - "ldr q23, [x23, #0x30]\n" - "ldr q24, [x22, #0x0]\n" - "ldr q25, [x22, #0x10]\n" - "ldr q26, [x22, #0x20]\n" - "ldr q27, [x22, #0x30]\n" - "b 218f\n" - "217:" // Height 5: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "218:" // Height 5: setup done - "mov x28, #0x0\n" - "219:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 220f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 221f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "add x22, x22, x20, LSL #1\n" - "b 221f\n" - "220:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "221:" // Height 5: input setup done - "cmp x27, #0x8\n" - "blt 224f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "blt 223f\n" - "222:" // Height 5: Multiply loop: Main loop head - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x23, x23, #0x10\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x10\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, 
[x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, 
#0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" - "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "ldr q2, [x24, #0x0]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "ldr q3, [x23, #0x0]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "bge 222b\n" - "223:" // Height 5: Multiply loop: Single iteration only - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "add x22, x22, #0x10\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.8h, v6.8h, v0.h[1]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v16.8h, v6.8h, v2.h[1]\n" - "fmla v20.8h, v6.8h, v3.h[1]\n" - "fmla v24.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.8h, v7.8h, v0.h[1]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v17.8h, v7.8h, v2.h[1]\n" - "fmla v21.8h, v7.8h, v3.h[1]\n" - "fmla v25.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.8h, v6.8h, 
v0.h[1]\n" - "fmla v14.8h, v6.8h, v1.h[1]\n" - "fmla v18.8h, v6.8h, v2.h[1]\n" - "fmla v22.8h, v6.8h, v3.h[1]\n" - "fmla v26.8h, v6.8h, v4.h[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.8h, v7.8h, v0.h[1]\n" - "fmla v15.8h, v7.8h, v1.h[1]\n" - "fmla v19.8h, v7.8h, v2.h[1]\n" - "fmla v23.8h, v7.8h, v3.h[1]\n" - "fmla v27.8h, v7.8h, v4.h[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.8h, v6.8h, v0.h[2]\n" - "fmla v12.8h, v6.8h, v1.h[2]\n" - "fmla v16.8h, v6.8h, v2.h[2]\n" - "fmla v20.8h, v6.8h, v3.h[2]\n" - "fmla v24.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.8h, v7.8h, v0.h[2]\n" - "fmla v13.8h, v7.8h, v1.h[2]\n" - "fmla v17.8h, v7.8h, v2.h[2]\n" - "fmla v21.8h, v7.8h, v3.h[2]\n" - "fmla v25.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.8h, v6.8h, v0.h[2]\n" - "fmla v14.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v2.h[2]\n" - "fmla v22.8h, v6.8h, v3.h[2]\n" - "fmla v26.8h, v6.8h, v4.h[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.8h, v7.8h, v0.h[2]\n" - "fmla v15.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v2.h[2]\n" - "fmla v23.8h, v7.8h, v3.h[2]\n" - "fmla v27.8h, v7.8h, v4.h[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.8h, v6.8h, v0.h[3]\n" - "fmla v12.8h, v6.8h, v1.h[3]\n" - "fmla v16.8h, v6.8h, v2.h[3]\n" - "fmla v20.8h, v6.8h, v3.h[3]\n" - "fmla v24.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.8h, v7.8h, v0.h[3]\n" - "fmla v13.8h, v7.8h, v1.h[3]\n" - "fmla v17.8h, v7.8h, v2.h[3]\n" - "fmla v21.8h, v7.8h, v3.h[3]\n" - "fmla v25.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0xf0]\n" - "fmla v10.8h, v6.8h, v0.h[3]\n" - "fmla v14.8h, v6.8h, v1.h[3]\n" - "fmla v18.8h, v6.8h, v2.h[3]\n" - "fmla v22.8h, v6.8h, v3.h[3]\n" - "fmla v26.8h, v6.8h, v4.h[3]\n" - "ldr q6, [x10, #0x100]\n" - "fmla v11.8h, v7.8h, v0.h[3]\n" - "fmla v15.8h, v7.8h, v1.h[3]\n" - "fmla v19.8h, v7.8h, v2.h[3]\n" - "fmla v23.8h, v7.8h, v3.h[3]\n" - "fmla v27.8h, v7.8h, v4.h[3]\n" - "ldr q7, [x10, #0x110]\n" - "fmla v8.8h, v6.8h, v0.h[4]\n" - "fmla v12.8h, v6.8h, v1.h[4]\n" - "fmla v16.8h, v6.8h, v2.h[4]\n" - "fmla v20.8h, v6.8h, v3.h[4]\n" - "fmla v24.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x120]\n" - "fmla v9.8h, v7.8h, v0.h[4]\n" - "fmla v13.8h, v7.8h, v1.h[4]\n" - "fmla v17.8h, v7.8h, v2.h[4]\n" - "fmla v21.8h, v7.8h, v3.h[4]\n" - "fmla v25.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x10, #0x130]\n" - "fmla v10.8h, v6.8h, v0.h[4]\n" - "fmla v14.8h, v6.8h, v1.h[4]\n" - "fmla v18.8h, v6.8h, v2.h[4]\n" - "fmla v22.8h, v6.8h, v3.h[4]\n" - "fmla v26.8h, v6.8h, v4.h[4]\n" - "ldr q6, [x10, #0x140]\n" - "fmla v11.8h, v7.8h, v0.h[4]\n" - "fmla v15.8h, v7.8h, v1.h[4]\n" - "fmla v19.8h, v7.8h, v2.h[4]\n" - "fmla v23.8h, v7.8h, v3.h[4]\n" - "fmla v27.8h, v7.8h, v4.h[4]\n" - "ldr q7, [x10, #0x150]\n" - "fmla v8.8h, v6.8h, v0.h[5]\n" - "fmla v12.8h, v6.8h, v1.h[5]\n" - "fmla v16.8h, v6.8h, v2.h[5]\n" - "fmla v20.8h, v6.8h, v3.h[5]\n" - "fmla v24.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, #0x160]\n" - "fmla v9.8h, v7.8h, v0.h[5]\n" - "fmla v13.8h, v7.8h, v1.h[5]\n" - "fmla v17.8h, v7.8h, v2.h[5]\n" - "fmla v21.8h, v7.8h, v3.h[5]\n" - "fmla v25.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x10, #0x170]\n" - "fmla v10.8h, v6.8h, v0.h[5]\n" - "fmla v14.8h, v6.8h, v1.h[5]\n" - "fmla v18.8h, v6.8h, v2.h[5]\n" - "fmla v22.8h, v6.8h, v3.h[5]\n" - "fmla v26.8h, v6.8h, v4.h[5]\n" - "ldr q6, [x10, #0x180]\n" - "fmla v11.8h, v7.8h, v0.h[5]\n" - "fmla v15.8h, v7.8h, v1.h[5]\n" - "fmla v19.8h, v7.8h, v2.h[5]\n" - "fmla v23.8h, v7.8h, v3.h[5]\n" - "fmla v27.8h, v7.8h, v4.h[5]\n" - "ldr q7, [x10, #0x190]\n" - "fmla v8.8h, v6.8h, 
v0.h[6]\n" - "fmla v12.8h, v6.8h, v1.h[6]\n" - "fmla v16.8h, v6.8h, v2.h[6]\n" - "fmla v20.8h, v6.8h, v3.h[6]\n" - "fmla v24.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x1a0]\n" - "fmla v9.8h, v7.8h, v0.h[6]\n" - "fmla v13.8h, v7.8h, v1.h[6]\n" - "fmla v17.8h, v7.8h, v2.h[6]\n" - "fmla v21.8h, v7.8h, v3.h[6]\n" - "fmla v25.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x10, #0x1b0]\n" - "fmla v10.8h, v6.8h, v0.h[6]\n" - "fmla v14.8h, v6.8h, v1.h[6]\n" - "fmla v18.8h, v6.8h, v2.h[6]\n" - "fmla v22.8h, v6.8h, v3.h[6]\n" - "fmla v26.8h, v6.8h, v4.h[6]\n" - "ldr q6, [x10, #0x1c0]\n" - "fmla v11.8h, v7.8h, v0.h[6]\n" - "fmla v15.8h, v7.8h, v1.h[6]\n" - "fmla v19.8h, v7.8h, v2.h[6]\n" - "fmla v23.8h, v7.8h, v3.h[6]\n" - "fmla v27.8h, v7.8h, v4.h[6]\n" - "ldr q7, [x10, #0x1d0]\n" - "fmla v8.8h, v6.8h, v0.h[7]\n" - "fmla v12.8h, v6.8h, v1.h[7]\n" - "fmla v16.8h, v6.8h, v2.h[7]\n" - "fmla v20.8h, v6.8h, v3.h[7]\n" - "fmla v24.8h, v6.8h, v4.h[7]\n" - "ldr q6, [x10, #0x1e0]\n" - "fmla v9.8h, v7.8h, v0.h[7]\n" - "fmla v13.8h, v7.8h, v1.h[7]\n" - "fmla v17.8h, v7.8h, v2.h[7]\n" - "fmla v21.8h, v7.8h, v3.h[7]\n" - "fmla v25.8h, v7.8h, v4.h[7]\n" - "ldr q7, [x10, #0x1f0]\n" - "add x10, x10, #0x200\n" - "fmla v10.8h, v6.8h, v0.h[7]\n" - "fmla v14.8h, v6.8h, v1.h[7]\n" - "fmla v18.8h, v6.8h, v2.h[7]\n" - "fmla v22.8h, v6.8h, v3.h[7]\n" - "fmla v26.8h, v6.8h, v4.h[7]\n" - "fmla v11.8h, v7.8h, v0.h[7]\n" - "fmla v15.8h, v7.8h, v1.h[7]\n" - "fmla v19.8h, v7.8h, v2.h[7]\n" - "fmla v23.8h, v7.8h, v3.h[7]\n" - "fmla v27.8h, v7.8h, v4.h[7]\n" - "224:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 226f\n" - "225:" // Height 5: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" - "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr h4, [x22], #0x2\n" + "mov x20, #0xc\n" + "ld1 { v16.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x25]\n" + "ld1 { v16.h }[6], [x24]\n" + "ld1 { v20.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "b 215f\n" + "212:" // Height 5: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x25]\n" + "ld1 { v16.h }[4], [x24]\n" + "ld1 { v20.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "b 215f\n" + "213:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 214f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x25], #0x4\n" + "mov x20, #0x4\n" + "ldr s16, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x25]\n" + "ld1 { v16.h }[2], [x24]\n" + "ld1 { v20.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "b 215f\n" + "214:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x25, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x24, #0x0]\n" + "ldr h20, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "215:" // Height 5: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 218f\n" + "216:" // Height 5: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "ldr q14, [x25, #0x20]\n" + "ldr q15, [x25, #0x30]\n" + "ldr q16, [x24, #0x0]\n" + "ldr q17, [x24, #0x10]\n" + "ldr q18, [x24, #0x20]\n" + "ldr q19, [x24, #0x30]\n" + "ldr q20, [x23, #0x0]\n" + "ldr q21, [x23, #0x10]\n" + "ldr q22, [x23, #0x20]\n" + "ldr q23, [x23, 
#0x30]\n" + "ldr q24, [x22, #0x0]\n" + "ldr q25, [x22, #0x10]\n" + "ldr q26, [x22, #0x20]\n" + "ldr q27, [x22, #0x30]\n" + "b 218f\n" + "217:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "218:" // Height 5: setup done + "mov x28, #0x0\n" + "219:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 220f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 221f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "b 221f\n" + "220:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "221:" // Height 5: input setup done + "cmp x27, #0x8\n" + "blt 224f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 223f\n" + "222:" // Height 5: Multiply loop: Main loop head "fmla v8.8h, v6.8h, v0.h[0]\n" "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x10\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr q28, [x10, #0x50]\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "fmla v21.8h, v28.8h, v3.h[1]\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x70]\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, 
v29.8h, v1.h[1]\n" + "fmla v18.8h, v29.8h, v2.h[1]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x80]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x90]\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xa0]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" + "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xb0]\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xc0]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xd0]\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0xe0]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0xf0]\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0x100]\n" + "fmla v11.8h, v28.8h, v0.h[3]\n" + "fmla v15.8h, v28.8h, v1.h[3]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0x110]\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x120]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x10, #0x130]\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla v26.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x140]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x10, #0x150]\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x160]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x10, #0x170]\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x180]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla 
v27.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x10, #0x190]\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x1a0]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x10, #0x1b0]\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x1c0]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x10, #0x1d0]\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr q29, [x10, #0x1e0]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr q28, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" + "bge 222b\n" + "223:" // Height 5: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" "fmla v16.8h, v6.8h, v2.h[0]\n" "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" "fmla v24.8h, v6.8h, v4.h[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q29, [x10, #0x20]\n" "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" "fmla v13.8h, v7.8h, v1.h[0]\n" "fmla v17.8h, v7.8h, v2.h[0]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" "fmla v21.8h, v7.8h, v3.h[0]\n" "fmla v25.8h, v7.8h, v4.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v29.8h, v0.h[0]\n" + "fmla v14.8h, v29.8h, v1.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v3.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.8h, v29.8h, v4.h[0]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v11.8h, v28.8h, v0.h[0]\n" + "fmla v15.8h, v28.8h, v1.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v3.h[0]\n" + "fmla v27.8h, v28.8h, v4.h[0]\n" + "ldr q28, [x10, #0x50]\n" + "fmla v8.8h, v29.8h, v0.h[1]\n" + "fmla v12.8h, v29.8h, v1.h[1]\n" + "fmla v16.8h, v29.8h, v2.h[1]\n" + "fmla v20.8h, v29.8h, v3.h[1]\n" + "fmla v24.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.8h, v28.8h, v0.h[1]\n" + "fmla v13.8h, v28.8h, v1.h[1]\n" + "fmla v17.8h, v28.8h, v2.h[1]\n" + "fmla v21.8h, v28.8h, v3.h[1]\n" + "fmla v25.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x70]\n" + "fmla v10.8h, v29.8h, v0.h[1]\n" + "fmla v14.8h, v29.8h, v1.h[1]\n" + 
"fmla v18.8h, v29.8h, v2.h[1]\n" + "fmla v22.8h, v29.8h, v3.h[1]\n" + "fmla v26.8h, v29.8h, v4.h[1]\n" + "ldr q29, [x10, #0x80]\n" + "fmla v11.8h, v28.8h, v0.h[1]\n" + "fmla v15.8h, v28.8h, v1.h[1]\n" + "fmla v19.8h, v28.8h, v2.h[1]\n" + "fmla v23.8h, v28.8h, v3.h[1]\n" + "fmla v27.8h, v28.8h, v4.h[1]\n" + "ldr q28, [x10, #0x90]\n" + "fmla v8.8h, v29.8h, v0.h[2]\n" + "fmla v12.8h, v29.8h, v1.h[2]\n" + "fmla v16.8h, v29.8h, v2.h[2]\n" + "fmla v20.8h, v29.8h, v3.h[2]\n" + "fmla v24.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xa0]\n" + "fmla v9.8h, v28.8h, v0.h[2]\n" + "fmla v13.8h, v28.8h, v1.h[2]\n" + "fmla v17.8h, v28.8h, v2.h[2]\n" + "fmla v21.8h, v28.8h, v3.h[2]\n" + "fmla v25.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xb0]\n" + "fmla v10.8h, v29.8h, v0.h[2]\n" + "fmla v14.8h, v29.8h, v1.h[2]\n" + "fmla v18.8h, v29.8h, v2.h[2]\n" + "fmla v22.8h, v29.8h, v3.h[2]\n" + "fmla v26.8h, v29.8h, v4.h[2]\n" + "ldr q29, [x10, #0xc0]\n" + "fmla v11.8h, v28.8h, v0.h[2]\n" + "fmla v15.8h, v28.8h, v1.h[2]\n" + "fmla v19.8h, v28.8h, v2.h[2]\n" + "fmla v23.8h, v28.8h, v3.h[2]\n" + "fmla v27.8h, v28.8h, v4.h[2]\n" + "ldr q28, [x10, #0xd0]\n" + "fmla v8.8h, v29.8h, v0.h[3]\n" + "fmla v12.8h, v29.8h, v1.h[3]\n" + "fmla v16.8h, v29.8h, v2.h[3]\n" + "fmla v20.8h, v29.8h, v3.h[3]\n" + "fmla v24.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0xe0]\n" + "fmla v9.8h, v28.8h, v0.h[3]\n" + "fmla v13.8h, v28.8h, v1.h[3]\n" + "fmla v17.8h, v28.8h, v2.h[3]\n" + "fmla v21.8h, v28.8h, v3.h[3]\n" + "fmla v25.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0xf0]\n" + "fmla v10.8h, v29.8h, v0.h[3]\n" + "fmla v14.8h, v29.8h, v1.h[3]\n" + "fmla v18.8h, v29.8h, v2.h[3]\n" + "fmla v22.8h, v29.8h, v3.h[3]\n" + "fmla v26.8h, v29.8h, v4.h[3]\n" + "ldr q29, [x10, #0x100]\n" + "fmla v11.8h, v28.8h, v0.h[3]\n" + "fmla v15.8h, v28.8h, v1.h[3]\n" + "fmla v19.8h, v28.8h, v2.h[3]\n" + "fmla v23.8h, v28.8h, v3.h[3]\n" + "fmla v27.8h, v28.8h, v4.h[3]\n" + "ldr q28, [x10, #0x110]\n" + "fmla v8.8h, v29.8h, v0.h[4]\n" + "fmla v12.8h, v29.8h, v1.h[4]\n" + "fmla v16.8h, v29.8h, v2.h[4]\n" + "fmla v20.8h, v29.8h, v3.h[4]\n" + "fmla v24.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x120]\n" + "fmla v9.8h, v28.8h, v0.h[4]\n" + "fmla v13.8h, v28.8h, v1.h[4]\n" + "fmla v17.8h, v28.8h, v2.h[4]\n" + "fmla v21.8h, v28.8h, v3.h[4]\n" + "fmla v25.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x10, #0x130]\n" + "fmla v10.8h, v29.8h, v0.h[4]\n" + "fmla v14.8h, v29.8h, v1.h[4]\n" + "fmla v18.8h, v29.8h, v2.h[4]\n" + "fmla v22.8h, v29.8h, v3.h[4]\n" + "fmla v26.8h, v29.8h, v4.h[4]\n" + "ldr q29, [x10, #0x140]\n" + "fmla v11.8h, v28.8h, v0.h[4]\n" + "fmla v15.8h, v28.8h, v1.h[4]\n" + "fmla v19.8h, v28.8h, v2.h[4]\n" + "fmla v23.8h, v28.8h, v3.h[4]\n" + "fmla v27.8h, v28.8h, v4.h[4]\n" + "ldr q28, [x10, #0x150]\n" + "fmla v8.8h, v29.8h, v0.h[5]\n" + "fmla v12.8h, v29.8h, v1.h[5]\n" + "fmla v16.8h, v29.8h, v2.h[5]\n" + "fmla v20.8h, v29.8h, v3.h[5]\n" + "fmla v24.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x160]\n" + "fmla v9.8h, v28.8h, v0.h[5]\n" + "fmla v13.8h, v28.8h, v1.h[5]\n" + "fmla v17.8h, v28.8h, v2.h[5]\n" + "fmla v21.8h, v28.8h, v3.h[5]\n" + "fmla v25.8h, v28.8h, v4.h[5]\n" + "ldr q28, [x10, #0x170]\n" + "fmla v10.8h, v29.8h, v0.h[5]\n" + "fmla v14.8h, v29.8h, v1.h[5]\n" + "fmla v18.8h, v29.8h, v2.h[5]\n" + "fmla v22.8h, v29.8h, v3.h[5]\n" + "fmla v26.8h, v29.8h, v4.h[5]\n" + "ldr q29, [x10, #0x180]\n" + "fmla v11.8h, v28.8h, v0.h[5]\n" + "fmla v15.8h, v28.8h, v1.h[5]\n" + "fmla v19.8h, v28.8h, v2.h[5]\n" + "fmla v23.8h, v28.8h, v3.h[5]\n" + "fmla v27.8h, v28.8h, v4.h[5]\n" 
+ "ldr q28, [x10, #0x190]\n" + "fmla v8.8h, v29.8h, v0.h[6]\n" + "fmla v12.8h, v29.8h, v1.h[6]\n" + "fmla v16.8h, v29.8h, v2.h[6]\n" + "fmla v20.8h, v29.8h, v3.h[6]\n" + "fmla v24.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x1a0]\n" + "fmla v9.8h, v28.8h, v0.h[6]\n" + "fmla v13.8h, v28.8h, v1.h[6]\n" + "fmla v17.8h, v28.8h, v2.h[6]\n" + "fmla v21.8h, v28.8h, v3.h[6]\n" + "fmla v25.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x10, #0x1b0]\n" + "fmla v10.8h, v29.8h, v0.h[6]\n" + "fmla v14.8h, v29.8h, v1.h[6]\n" + "fmla v18.8h, v29.8h, v2.h[6]\n" + "fmla v22.8h, v29.8h, v3.h[6]\n" + "fmla v26.8h, v29.8h, v4.h[6]\n" + "ldr q29, [x10, #0x1c0]\n" + "fmla v11.8h, v28.8h, v0.h[6]\n" + "fmla v15.8h, v28.8h, v1.h[6]\n" + "fmla v19.8h, v28.8h, v2.h[6]\n" + "fmla v23.8h, v28.8h, v3.h[6]\n" + "fmla v27.8h, v28.8h, v4.h[6]\n" + "ldr q28, [x10, #0x1d0]\n" + "fmla v8.8h, v29.8h, v0.h[7]\n" + "fmla v12.8h, v29.8h, v1.h[7]\n" + "fmla v16.8h, v29.8h, v2.h[7]\n" + "fmla v20.8h, v29.8h, v3.h[7]\n" + "fmla v24.8h, v29.8h, v4.h[7]\n" + "ldr q29, [x10, #0x1e0]\n" + "fmla v9.8h, v28.8h, v0.h[7]\n" + "fmla v13.8h, v28.8h, v1.h[7]\n" + "fmla v17.8h, v28.8h, v2.h[7]\n" + "fmla v21.8h, v28.8h, v3.h[7]\n" + "fmla v25.8h, v28.8h, v4.h[7]\n" + "ldr q28, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v29.8h, v0.h[7]\n" + "fmla v14.8h, v29.8h, v1.h[7]\n" + "fmla v18.8h, v29.8h, v2.h[7]\n" + "fmla v22.8h, v29.8h, v3.h[7]\n" + "fmla v26.8h, v29.8h, v4.h[7]\n" + "fmla v11.8h, v28.8h, v0.h[7]\n" + "fmla v15.8h, v28.8h, v1.h[7]\n" + "fmla v19.8h, v28.8h, v2.h[7]\n" + "fmla v23.8h, v28.8h, v3.h[7]\n" + "fmla v27.8h, v28.8h, v4.h[7]\n" + "224:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 226f\n" + "225:" // Height 5: Multiply loop: Odd block loop + "ldr h4, [x26], #0x2\n" + "ldr h3, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h1, [x23], #0x2\n" + "ldr h0, [x22], #0x2\n" + "ldr q29, [x10, #0x0]\n" + "fmla v8.8h, v29.8h, v4.h[0]\n" + "fmla v12.8h, v29.8h, v3.h[0]\n" + "ldr q28, [x10, #0x10]\n" + "fmla v16.8h, v29.8h, v2.h[0]\n" + "fmla v20.8h, v29.8h, v1.h[0]\n" + "fmla v24.8h, v29.8h, v0.h[0]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.8h, v28.8h, v4.h[0]\n" + "fmla v13.8h, v28.8h, v3.h[0]\n" + "fmla v17.8h, v28.8h, v2.h[0]\n" + "fmla v21.8h, v28.8h, v1.h[0]\n" + "fmla v25.8h, v28.8h, v0.h[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v10.8h, v29.8h, v4.h[0]\n" + "fmla v14.8h, v29.8h, v3.h[0]\n" + "fmla v18.8h, v29.8h, v2.h[0]\n" + "fmla v22.8h, v29.8h, v1.h[0]\n" + "fmla v26.8h, v29.8h, v0.h[0]\n" + "fmla v11.8h, v28.8h, v4.h[0]\n" + "fmla v15.8h, v28.8h, v3.h[0]\n" + "fmla v19.8h, v28.8h, v2.h[0]\n" + "fmla v23.8h, v28.8h, v1.h[0]\n" + "fmla v27.8h, v28.8h, v0.h[0]\n" "cbnz x27, 225b\n" "226:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -3719,49 +3719,49 @@ void a64_hybrid_fp16_mla_6x32 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 227f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.8h }, [x20]\n" + "ld1r { v29.8h }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.8h }, [x20]\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - 
"fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fmin v27.8h, v27.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmax v27.8h, v27.8h, v0.8h\n" + "ld1r { v28.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v29.8h\n" + "fmin v9.8h, v9.8h, v29.8h\n" + "fmin v10.8h, v10.8h, v29.8h\n" + "fmin v11.8h, v11.8h, v29.8h\n" + "fmin v12.8h, v12.8h, v29.8h\n" + "fmin v13.8h, v13.8h, v29.8h\n" + "fmin v14.8h, v14.8h, v29.8h\n" + "fmin v15.8h, v15.8h, v29.8h\n" + "fmin v16.8h, v16.8h, v29.8h\n" + "fmin v17.8h, v17.8h, v29.8h\n" + "fmin v18.8h, v18.8h, v29.8h\n" + "fmin v19.8h, v19.8h, v29.8h\n" + "fmin v20.8h, v20.8h, v29.8h\n" + "fmin v21.8h, v21.8h, v29.8h\n" + "fmin v22.8h, v22.8h, v29.8h\n" + "fmin v23.8h, v23.8h, v29.8h\n" + "fmin v24.8h, v24.8h, v29.8h\n" + "fmin v25.8h, v25.8h, v29.8h\n" + "fmin v26.8h, v26.8h, v29.8h\n" + "fmin v27.8h, v27.8h, v29.8h\n" + "fmax v8.8h, v8.8h, v28.8h\n" + "fmax v9.8h, v9.8h, v28.8h\n" + "fmax v10.8h, v10.8h, v28.8h\n" + "fmax v11.8h, v11.8h, v28.8h\n" + "fmax v12.8h, v12.8h, v28.8h\n" + "fmax v13.8h, v13.8h, v28.8h\n" + "fmax v14.8h, v14.8h, v28.8h\n" + "fmax v15.8h, v15.8h, v28.8h\n" + "fmax v16.8h, v16.8h, v28.8h\n" + "fmax v17.8h, v17.8h, v28.8h\n" + "fmax v18.8h, v18.8h, v28.8h\n" + "fmax v19.8h, v19.8h, v28.8h\n" + "fmax v20.8h, v20.8h, v28.8h\n" + "fmax v21.8h, v21.8h, v28.8h\n" + "fmax v22.8h, v22.8h, v28.8h\n" + "fmax v23.8h, v23.8h, v28.8h\n" + "fmax v24.8h, v24.8h, v28.8h\n" + "fmax v25.8h, v25.8h, v28.8h\n" + "fmax v26.8h, v26.8h, v28.8h\n" + "fmax v27.8h, v27.8h, v28.8h\n" "227:" // Height 5: No activation "cmp x11, #0x20\n" "bge 244f\n" @@ -4386,16 +4386,16 @@ void a64_hybrid_fp16_mla_6x32 ( "268:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 269f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 270f\n" "ldr x20, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -4407,11 +4407,11 @@ void a64_hybrid_fp16_mla_6x32 ( "b 270f\n" "269:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "270:" // Height 6: input setup done "cmp x27, #0x8\n" "blt 273f\n" @@ -4912,42 +4912,42 @@ void a64_hybrid_fp16_mla_6x32 ( "273:" // Height 6: Multiply loop: Main loop skip "cbz x27, 275f\n" "274:" // Height 6: Multiply loop: Odd block loop - "ldr h0, [x26], #0x2\n" - "ldr h1, [x25], #0x2\n" + "ldr h7, [x26], #0x2\n" + "ldr h6, [x25], #0x2\n" "sub x27, x27, #0x1\n" - "ldr h2, [x24], #0x2\n" - "ldr h3, [x23], #0x2\n" - "ldr h4, [x22], #0x2\n" - "ldr h5, [x21], #0x2\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[0]\n" - "fmla v16.8h, v6.8h, v2.h[0]\n" - "fmla v20.8h, v6.8h, v3.h[0]\n" - "fmla v24.8h, v6.8h, v4.h[0]\n" - "fmla v28.8h, v6.8h, v5.h[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[0]\n" - "fmla v17.8h, v7.8h, v2.h[0]\n" - "fmla v21.8h, v7.8h, v3.h[0]\n" - "fmla v25.8h, v7.8h, v4.h[0]\n" - "fmla v29.8h, v7.8h, v5.h[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr h5, [x24], #0x2\n" + "ldr h4, [x23], #0x2\n" + "ldr h3, [x22], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + "fmla v8.8h, v1.8h, v7.h[0]\n" + "fmla v12.8h, v1.8h, v6.h[0]\n" + "fmla v16.8h, v1.8h, v5.h[0]\n" + "fmla v20.8h, v1.8h, v4.h[0]\n" + "fmla v24.8h, v1.8h, v3.h[0]\n" + "fmla v28.8h, v1.8h, v2.h[0]\n" + "ldr q1, [x10, #0x20]\n" + "fmla v9.8h, v0.8h, v7.h[0]\n" + "fmla v13.8h, v0.8h, v6.h[0]\n" + "fmla v17.8h, v0.8h, v5.h[0]\n" + "fmla v21.8h, v0.8h, v4.h[0]\n" + "fmla v25.8h, v0.8h, v3.h[0]\n" + "fmla v29.8h, v0.8h, v2.h[0]\n" + "ldr q0, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v14.8h, v6.8h, v1.h[0]\n" - "fmla v18.8h, v6.8h, v2.h[0]\n" - "fmla v22.8h, v6.8h, v3.h[0]\n" - "fmla v26.8h, v6.8h, v4.h[0]\n" - "fmla v30.8h, v6.8h, v5.h[0]\n" - "fmla v11.8h, v7.8h, v0.h[0]\n" - "fmla v15.8h, v7.8h, v1.h[0]\n" - "fmla v19.8h, v7.8h, v2.h[0]\n" - "fmla v23.8h, v7.8h, v3.h[0]\n" - "fmla v27.8h, v7.8h, v4.h[0]\n" - "fmla v31.8h, v7.8h, v5.h[0]\n" + "fmla v10.8h, v1.8h, v7.h[0]\n" + "fmla v14.8h, v1.8h, v6.h[0]\n" + "fmla v18.8h, v1.8h, v5.h[0]\n" + "fmla v22.8h, v1.8h, v4.h[0]\n" + "fmla v26.8h, v1.8h, v3.h[0]\n" + "fmla v30.8h, v1.8h, v2.h[0]\n" + "fmla v11.8h, v0.8h, v7.h[0]\n" + "fmla v15.8h, v0.8h, v6.h[0]\n" + "fmla v19.8h, v0.8h, v5.h[0]\n" + "fmla v23.8h, v0.8h, v4.h[0]\n" + "fmla v27.8h, v0.8h, v3.h[0]\n" + "fmla v31.8h, v0.8h, v2.h[0]\n" "cbnz x27, 274b\n" "275:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -5317,7 +5317,6 @@ void a64_hybrid_fp16_mla_6x32 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "296:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), 
[offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp index e155bfb111..171929e65e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -113,5 +113,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp index 700d803f82..9ceda8fd0c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_4x24_a55 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 124f\n" @@ -223,11 +222,11 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "19:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w12, [x20, x13, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x11, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x11, [x20, #0x0]\n" "cbnz x13, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x11, x11, x20, LSL #2\n" @@ -246,176 +245,176 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "blt 23f\n" "22:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr d4, [x15, #0x40]\n" - "ldr x10, [x15, #0x48]\n" + "ldr d19, [x15, #0x40]\n" + "ldr x20, [x15, #0x48]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr d5, [x15, #0x50]\n" + "ldr d18, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr d6, [x15, #0x60]\n" + "ldr d17, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr d7, [x15, #0x70]\n" - "mov v4.d[1], x10\n" - "ldr x9, [x15, #0x58]\n" - "mov v5.d[1], x9\n" - "ldr x28, [x15, #0x68]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0x78]\n" - "mov v7.d[1], x27\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr d4, [x15, #0x80]\n" - "ldr x10, [x15, #0x88]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr d5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr d6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v4.d[1], x10\n" - "ldr x9, [x15, #0x98]\n" - "mov v5.d[1], x9\n" - "ldr x28, [x15, #0xa8]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0xb8]\n" - "mov v7.d[1], x27\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "ldr d4, [x15, #0xc0]\n" - "ldr x10, [x15, #0xc8]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr d5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "ldr d6, [x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v4.d[1], x10\n" - "ldr x9, [x15, #0xd8]\n" - "mov v5.d[1], x9\n" - "ldr x28, [x15, #0xe8]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0xf8]\n" - "mov v7.d[1], x27\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "ldr d4, [x15, #0x100]\n" - "ldr x10, [x15, #0x108]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr d5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr d6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr d7, [x15, #0x130]\n" - "mov v4.d[1], x10\n" - "ldr x9, [x15, #0x118]\n" - "mov v5.d[1], x9\n" - "ldr x28, [x15, #0x128]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0x138]\n" - "mov v7.d[1], x27\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "ldr d4, [x15, #0x140]\n" - "ldr x10, [x15, #0x148]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr d5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr d6, [x15, 
#0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr d7, [x15, #0x170]\n" - "mov v4.d[1], x10\n" - "ldr x9, [x15, #0x158]\n" - "mov v5.d[1], x9\n" - "ldr x28, [x15, #0x168]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0x178]\n" - "mov v7.d[1], x27\n" + "ldr d16, [x15, #0x70]\n" + "mov v19.d[1], x20\n" + "ldr x20, [x15, #0x58]\n" + "mov v18.d[1], x20\n" + "ldr x20, [x15, #0x68]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x78]\n" + "mov v16.d[1], x20\n" + "fmla v12.4s, v19.4s, v0.s[0]\n" + "ldr d19, [x15, #0x80]\n" + "ldr x20, [x15, #0x88]\n" + "fmla v13.4s, v18.4s, v0.s[0]\n" + "ldr d18, [x15, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr d17, [x15, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr d16, [x15, #0xb0]\n" + "mov v19.d[1], x20\n" + "ldr x20, [x15, #0x98]\n" + "mov v18.d[1], x20\n" + "ldr x20, [x15, #0xa8]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xb8]\n" + "mov v16.d[1], x20\n" + "fmla v10.4s, v19.4s, v0.s[1]\n" + "ldr d19, [x15, #0xc0]\n" + "ldr x20, [x15, #0xc8]\n" + "fmla v11.4s, v18.4s, v0.s[1]\n" + "ldr d18, [x15, #0xd0]\n" + "fmla v12.4s, v17.4s, v0.s[1]\n" + "ldr d17, [x15, #0xe0]\n" + "fmla v13.4s, v16.4s, v0.s[1]\n" + "ldr d16, [x15, #0xf0]\n" + "mov v19.d[1], x20\n" + "ldr x20, [x15, #0xd8]\n" + "mov v18.d[1], x20\n" + "ldr x20, [x15, #0xe8]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xf8]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v19.4s, v0.s[2]\n" + "ldr d19, [x15, #0x100]\n" + "ldr x20, [x15, #0x108]\n" + "fmla v9.4s, v18.4s, v0.s[2]\n" + "ldr d18, [x15, #0x110]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr d17, [x15, #0x120]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr d16, [x15, #0x130]\n" + "mov v19.d[1], x20\n" + "ldr x20, [x15, #0x118]\n" + "mov v18.d[1], x20\n" + "ldr x20, [x15, #0x128]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x138]\n" + "mov v16.d[1], x20\n" + "fmla v12.4s, v19.4s, v0.s[2]\n" + "ldr d19, [x15, #0x140]\n" + "ldr x20, [x15, #0x148]\n" + "fmla v13.4s, v18.4s, v0.s[2]\n" + "ldr d18, [x15, #0x150]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr d17, [x15, #0x160]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr d16, [x15, #0x170]\n" + "mov v19.d[1], x20\n" + "ldr x20, [x15, #0x158]\n" + "mov v18.d[1], x20\n" + "ldr x20, [x15, #0x168]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x178]\n" + "mov v16.d[1], x20\n" "add x11, x11, #0x10\n" "add x15, x15, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v10.4s, v19.4s, v0.s[3]\n" "ldr d4, [x15, #0x0]\n" - "ldr x10, [x15, #0x8]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" + "ldr x20, [x15, #0x8]\n" + "fmla v11.4s, v18.4s, v0.s[3]\n" "ldr d5, [x15, #0x10]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v0.s[3]\n" "ldr d6, [x15, #0x20]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v0.s[3]\n" "ldr d0, [x11, #0x0]\n" "sub x12, x12, #0x4\n" "ldr d7, [x15, #0x30]\n" "cmp x12, #0x8\n" - "ldr x9, [x15, #0x18]\n" - "mov v4.d[1], x10\n" - "ldr x28, [x15, #0x28]\n" - "mov v5.d[1], x9\n" - "ldr x26, [x11, #0x8]\n" - "mov v6.d[1], x28\n" - "ldr x27, [x15, #0x38]\n" - "mov v0.d[1], x26\n" - "mov v7.d[1], x27\n" + "ldr x21, [x15, #0x18]\n" + "mov v4.d[1], x20\n" + "ldr x20, [x15, #0x28]\n" + "mov v5.d[1], x21\n" + "ldr x21, [x11, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x15, #0x38]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x11, #0x80]\n" "bge 22b\n" "23:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x15, #0x40]\n" + "ldr q19, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q5, [x15, #0x50]\n" + "ldr 
q18, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q17, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x15, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x15, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr q5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "ldr q4, [x15, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr q5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x15, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x15, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x15, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x15, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x15, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x15, #0x170]\n" + "ldr q16, [x15, #0x70]\n" + "fmla v12.4s, v19.4s, v0.s[0]\n" + "ldr q19, [x15, #0x80]\n" + "fmla v13.4s, v18.4s, v0.s[0]\n" + "ldr q18, [x15, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x15, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x15, #0xb0]\n" + "fmla v10.4s, v19.4s, v0.s[1]\n" + "ldr q19, [x15, #0xc0]\n" + "fmla v11.4s, v18.4s, v0.s[1]\n" + "ldr q18, [x15, #0xd0]\n" + "fmla v12.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x15, #0xe0]\n" + "fmla v13.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x15, #0xf0]\n" + "fmla v8.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x15, #0x100]\n" + "fmla v9.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x15, #0x110]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x15, #0x120]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x15, #0x130]\n" + "fmla v12.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x15, #0x140]\n" + "fmla v13.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x15, #0x150]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x15, #0x160]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x15, #0x170]\n" "add x11, x11, #0x10\n" "sub x12, x12, #0x4\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" + "fmla v10.4s, v19.4s, v0.s[3]\n" "prfm pldl1keep, [x11, #0x80]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v11.4s, v18.4s, v0.s[3]\n" "add x15, x15, #0x180\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v0.s[3]\n" "24:" // Height 1: Multiply loop: Main loop skip "cbz x12, 26f\n" "25:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" + "ldr s17, [x11], #0x4\n" "sub x12, x12, #0x1\n" - "ldr q4, [x15, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x15, #0x10]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x15, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q4, [x15, #0x40]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x15, #0x50]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q16, [x15, #0x0]\n" + "fmla v8.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x15, #0x10]\n" + "fmla v9.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x15, #0x20]\n" + "fmla v10.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x15, #0x30]\n" + "fmla v11.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x15, #0x40]\n" + "fmla v12.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x15, #0x50]\n" + "fmla v13.4s, v16.4s, v17.s[0]\n" "add x15, x15, #0x60\n" "cbnz x12, 25b\n" "26:" // 
Height 1: Multiply loop: No odd multiplies @@ -426,21 +425,21 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "prfm pstl1keep, [x14, #0x0]\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v16.4s\n" + "fmin v9.4s, v9.4s, v16.4s\n" + "fmin v10.4s, v10.4s, v16.4s\n" + "fmin v11.4s, v11.4s, v16.4s\n" + "fmin v12.4s, v12.4s, v16.4s\n" + "fmin v13.4s, v13.4s, v16.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" "27:" // Height 1: No activation "cmp x16, #0x18\n" "bge 40f\n" @@ -701,26 +700,26 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "60:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w12, [x20, x13, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x11, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x10, [x20, #0x8]\n" "cbnz x13, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x11, x11, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" "b 62f\n" "61:" // Height 2: setup direct input "mov x11, %x[input_ptr]\n" - "add x25, x11, x20, LSL #2\n" + "add x10, x11, x21, LSL #2\n" "62:" // Height 2: input setup done "cmp x12, #0x4\n" "blt 65f\n" "ldr q0, [x11, #0x0]\n" "cmp x12, #0x8\n" - "ldr q1, [x25, #0x0]\n" + "ldr q1, [x10, #0x0]\n" "ldr q4, [x15, #0x0]\n" "ldr q5, [x15, #0x10]\n" "ldr q6, [x15, #0x20]\n" @@ -728,239 +727,239 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "blt 64f\n" "63:" // Height 2: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr x10, [x15, #0x48]\n" + "ldr x23, [x15, #0x48]\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr d4, [x15, #0x40]\n" + "ldr d23, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr x9, [x15, #0x58]\n" + "ldr x22, [x15, #0x58]\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr d5, [x15, #0x50]\n" + "ldr d22, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr x28, [x15, #0x68]\n" + "ldr x21, [x15, #0x68]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "ldr d6, [x15, #0x60]\n" + "ldr d21, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr x27, [x15, #0x78]\n" + "ldr x20, [x15, #0x78]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr d7, [x15, #0x70]\n" - "mov v4.d[1], x10\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "mov v5.d[1], x9\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr d4, [x15, #0x80]\n" - "mov v6.d[1], x28\n" - "mov v7.d[1], x27\n" - "ldr x10, [x15, #0x88]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr x9, [x15, #0x98]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr d5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr x28, [x15, #0xa8]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" 
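// In this Cortex-A55 path each 128-bit B operand is assembled from two
// 64-bit halves: "ldr dN" fills the low half and "mov vN.d[1], xM" inserts
// the high half from a general-purpose register, presumably because 64-bit
// loads dual-issue more freely on the in-order A55 than a single q-form
// load. A hedged intrinsics sketch of the same split load (illustrative
// only; assumes a little-endian AArch64 target, <arm_neon.h>, <cstring>):
//
//   static inline float32x4_t load_q_split(const float *p) {
//       float32x4_t v = vcombine_f32(vld1_f32(p), vdup_n_f32(0.f)); // ldr d
//       uint64_t hi;
//       std::memcpy(&hi, p + 2, sizeof(hi));              // ldr x, [p, #8]
//       return vreinterpretq_f32_u64(                     // mov v.d[1], x
//           vsetq_lane_u64(hi, vreinterpretq_u64_f32(v), 1));
//   }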
- "ldr d6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr x27, [x15, #0xb8]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v4.d[1], x10\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "mov v5.d[1], x9\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "ldr d4, [x15, #0xc0]\n" - "mov v6.d[1], x28\n" - "mov v7.d[1], x27\n" - "ldr x10, [x15, #0xc8]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr x9, [x15, #0xd8]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr d5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "ldr x28, [x15, #0xe8]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "ldr d6, [x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "ldr x27, [x15, #0xf8]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v4.d[1], x10\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "mov v5.d[1], x9\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr d4, [x15, #0x100]\n" - "mov v6.d[1], x28\n" - "mov v7.d[1], x27\n" - "ldr x10, [x15, #0x108]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr x9, [x15, #0x118]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr d5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr x28, [x15, #0x128]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "ldr d6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr x27, [x15, #0x138]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr d7, [x15, #0x130]\n" - "mov v4.d[1], x10\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "mov v5.d[1], x9\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr d4, [x15, #0x140]\n" - "mov v6.d[1], x28\n" - "mov v7.d[1], x27\n" - "ldr x10, [x15, #0x148]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr x9, [x15, #0x158]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "ldr d5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr x28, [x15, #0x168]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr d6, [x15, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr x27, [x15, #0x178]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d7, [x15, #0x170]\n" - "mov v4.d[1], x10\n" + "ldr d20, [x15, #0x70]\n" + "mov v23.d[1], x23\n" + "fmla v12.4s, v23.4s, v0.s[0]\n" + "mov v22.d[1], x22\n" + "fmla v18.4s, v23.4s, v1.s[0]\n" + "ldr d23, [x15, #0x80]\n" + "mov v21.d[1], x21\n" + "mov v20.d[1], x20\n" + "ldr x23, [x15, #0x88]\n" + "fmla v13.4s, v22.4s, v0.s[0]\n" + "ldr x22, [x15, #0x98]\n" + "fmla v19.4s, v22.4s, v1.s[0]\n" + "ldr d22, [x15, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "ldr x21, [x15, #0xa8]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "ldr d21, [x15, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "ldr x20, [x15, #0xb8]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "ldr d20, [x15, #0xb0]\n" + "mov v23.d[1], x23\n" + "fmla v10.4s, v23.4s, v0.s[1]\n" + "mov v22.d[1], x22\n" + "fmla v16.4s, v23.4s, v1.s[1]\n" + "ldr d23, [x15, #0xc0]\n" + "mov v21.d[1], x21\n" + "mov v20.d[1], x20\n" + "ldr x23, [x15, #0xc8]\n" + "fmla v11.4s, v22.4s, v0.s[1]\n" + "ldr x22, [x15, #0xd8]\n" + "fmla v17.4s, v22.4s, v1.s[1]\n" + "ldr d22, [x15, #0xd0]\n" + "fmla v12.4s, v21.4s, v0.s[1]\n" + "ldr x21, [x15, #0xe8]\n" + "fmla v18.4s, v21.4s, v1.s[1]\n" + "ldr d21, [x15, #0xe0]\n" + "fmla v13.4s, v20.4s, v0.s[1]\n" + "ldr x20, [x15, #0xf8]\n" + "fmla v19.4s, v20.4s, v1.s[1]\n" + "ldr d20, [x15, #0xf0]\n" + "mov v23.d[1], x23\n" + "fmla v8.4s, v23.4s, v0.s[2]\n" + "mov v22.d[1], x22\n" + "fmla v14.4s, v23.4s, v1.s[2]\n" + "ldr d23, [x15, #0x100]\n" + "mov v21.d[1], x21\n" + "mov v20.d[1], x20\n" + "ldr x23, [x15, #0x108]\n" + "fmla v9.4s, v22.4s, v0.s[2]\n" + "ldr x22, [x15, #0x118]\n" + "fmla v15.4s, v22.4s, v1.s[2]\n" + "ldr d22, [x15, #0x110]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "ldr x21, 
[x15, #0x128]\n" + "fmla v16.4s, v21.4s, v1.s[2]\n" + "ldr d21, [x15, #0x120]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "ldr x20, [x15, #0x138]\n" + "fmla v17.4s, v20.4s, v1.s[2]\n" + "ldr d20, [x15, #0x130]\n" + "mov v23.d[1], x23\n" + "fmla v12.4s, v23.4s, v0.s[2]\n" + "mov v22.d[1], x22\n" + "fmla v18.4s, v23.4s, v1.s[2]\n" + "ldr d23, [x15, #0x140]\n" + "mov v21.d[1], x21\n" + "mov v20.d[1], x20\n" + "ldr x23, [x15, #0x148]\n" + "fmla v13.4s, v22.4s, v0.s[2]\n" + "ldr x22, [x15, #0x158]\n" + "fmla v19.4s, v22.4s, v1.s[2]\n" + "ldr d22, [x15, #0x150]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "ldr x21, [x15, #0x168]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "ldr d21, [x15, #0x160]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "ldr x20, [x15, #0x178]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "ldr d20, [x15, #0x170]\n" + "mov v23.d[1], x23\n" "add x11, x11, #0x10\n" - "mov v5.d[1], x9\n" - "add x25, x25, #0x10\n" - "mov v6.d[1], x28\n" + "mov v22.d[1], x22\n" + "add x10, x10, #0x10\n" + "mov v21.d[1], x21\n" "add x15, x15, #0x180\n" - "mov v7.d[1], x27\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" + "mov v20.d[1], x20\n" + "fmla v10.4s, v23.4s, v0.s[3]\n" + "fmla v16.4s, v23.4s, v1.s[3]\n" "ldr d4, [x15, #0x0]\n" - "ldr x10, [x15, #0x8]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" + "ldr x21, [x15, #0x8]\n" + "fmla v11.4s, v22.4s, v0.s[3]\n" + "fmla v17.4s, v22.4s, v1.s[3]\n" "ldr d5, [x15, #0x10]\n" - "ldr x9, [x15, #0x18]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" + "ldr x20, [x15, #0x18]\n" + "fmla v12.4s, v21.4s, v0.s[3]\n" + "fmla v18.4s, v21.4s, v1.s[3]\n" "ldr d6, [x15, #0x20]\n" - "ldr x28, [x15, #0x28]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "ldr x23, [x15, #0x28]\n" + "fmla v13.4s, v20.4s, v0.s[3]\n" "ldr d0, [x11, #0x0]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x25, #0x0]\n" + "fmla v19.4s, v20.4s, v1.s[3]\n" + "ldr d1, [x10, #0x0]\n" "sub x12, x12, #0x4\n" "ldr d7, [x15, #0x30]\n" "cmp x12, #0x8\n" - "ldr x26, [x11, #0x8]\n" - "mov v4.d[1], x10\n" - "ldr x24, [x25, #0x8]\n" - "mov v5.d[1], x9\n" - "ldr x27, [x15, #0x38]\n" - "mov v6.d[1], x28\n" + "ldr x22, [x11, #0x8]\n" + "mov v4.d[1], x21\n" + "ldr x21, [x10, #0x8]\n" + "mov v5.d[1], x20\n" + "ldr x20, [x15, #0x38]\n" + "mov v6.d[1], x23\n" "prfm pldl1keep, [x11, #0x80]\n" - "mov v0.d[1], x26\n" - "prfm pldl1keep, [x25, #0x80]\n" - "mov v1.d[1], x24\n" - "mov v7.d[1], x27\n" + "mov v0.d[1], x22\n" + "prfm pldl1keep, [x10, #0x80]\n" + "mov v1.d[1], x21\n" + "mov v7.d[1], x20\n" "bge 63b\n" "64:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" "add x11, x11, #0x10\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x15, #0x40]\n" + "ldr q23, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x15, #0x50]\n" + "ldr q22, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "sub x12, x12, #0x4\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q21, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "prfm pldl1keep, [x11, #0x80]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x15, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x15, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - 
"fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "ldr q4, [x15, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr q5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x15, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x15, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x15, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x15, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x15, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x15, #0x170]\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" + "ldr q20, [x15, #0x70]\n" + "fmla v12.4s, v23.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v18.4s, v23.4s, v1.s[0]\n" + "ldr q23, [x15, #0x80]\n" + "fmla v13.4s, v22.4s, v0.s[0]\n" + "fmla v19.4s, v22.4s, v1.s[0]\n" + "ldr q22, [x15, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x15, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x15, #0xb0]\n" + "fmla v10.4s, v23.4s, v0.s[1]\n" + "fmla v16.4s, v23.4s, v1.s[1]\n" + "ldr q23, [x15, #0xc0]\n" + "fmla v11.4s, v22.4s, v0.s[1]\n" + "fmla v17.4s, v22.4s, v1.s[1]\n" + "ldr q22, [x15, #0xd0]\n" + "fmla v12.4s, v21.4s, v0.s[1]\n" + "fmla v18.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x15, #0xe0]\n" + "fmla v13.4s, v20.4s, v0.s[1]\n" + "fmla v19.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x15, #0xf0]\n" + "fmla v8.4s, v23.4s, v0.s[2]\n" + "fmla v14.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x15, #0x100]\n" + "fmla v9.4s, v22.4s, v0.s[2]\n" + "fmla v15.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x15, #0x110]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v16.4s, v21.4s, v1.s[2]\n" + "ldr q21, [x15, #0x120]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v17.4s, v20.4s, v1.s[2]\n" + "ldr q20, [x15, #0x130]\n" + "fmla v12.4s, v23.4s, v0.s[2]\n" + "fmla v18.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x15, #0x140]\n" + "fmla v13.4s, v22.4s, v0.s[2]\n" + "fmla v19.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x15, #0x150]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "ldr q21, [x15, #0x160]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "ldr q20, [x15, #0x170]\n" + "fmla v10.4s, v23.4s, v0.s[3]\n" "add x15, x15, #0x180\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v16.4s, v23.4s, v1.s[3]\n" + "fmla v11.4s, v22.4s, v0.s[3]\n" + "fmla v17.4s, v22.4s, v1.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[3]\n" + "fmla v18.4s, v21.4s, v1.s[3]\n" + "fmla v13.4s, v20.4s, v0.s[3]\n" + "fmla v19.4s, v20.4s, v1.s[3]\n" "65:" // Height 2: Multiply loop: Main loop skip "cbz x12, 67f\n" "66:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" 
+ "ldr s25, [x11], #0x4\n" "sub x12, x12, #0x1\n" - "ldr s1, [x25], #0x4\n" - "ldr q4, [x15, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x15, #0x10]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q7, [x15, #0x30]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr q4, [x15, #0x40]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q5, [x15, #0x50]\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr s24, [x10], #0x4\n" + "ldr q21, [x15, #0x0]\n" + "fmla v8.4s, v21.4s, v25.s[0]\n" + "ldr q20, [x15, #0x10]\n" + "fmla v14.4s, v21.4s, v24.s[0]\n" + "ldr q23, [x15, #0x20]\n" + "fmla v9.4s, v20.4s, v25.s[0]\n" + "ldr q22, [x15, #0x30]\n" + "fmla v15.4s, v20.4s, v24.s[0]\n" + "ldr q21, [x15, #0x40]\n" + "fmla v10.4s, v23.4s, v25.s[0]\n" + "ldr q20, [x15, #0x50]\n" + "fmla v16.4s, v23.4s, v24.s[0]\n" + "fmla v11.4s, v22.4s, v25.s[0]\n" "add x15, x15, #0x60\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v17.4s, v22.4s, v24.s[0]\n" + "fmla v12.4s, v21.4s, v25.s[0]\n" + "fmla v18.4s, v21.4s, v24.s[0]\n" + "fmla v13.4s, v20.4s, v25.s[0]\n" + "fmla v19.4s, v20.4s, v24.s[0]\n" "cbnz x12, 66b\n" "67:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -973,33 +972,33 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 68f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" - "fmin v16.4s, v16.4s, v0.4s\n" - "fmin v17.4s, v17.4s, v0.4s\n" - "fmin v18.4s, v18.4s, v0.4s\n" - "fmin v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v20.4s\n" + "fmin v9.4s, v9.4s, v20.4s\n" + "fmin v10.4s, v10.4s, v20.4s\n" + "fmin v11.4s, v11.4s, v20.4s\n" + "fmin v12.4s, v12.4s, v20.4s\n" + "fmin v13.4s, v13.4s, v20.4s\n" + "fmin v14.4s, v14.4s, v20.4s\n" + "fmin v15.4s, v15.4s, v20.4s\n" + "fmin v16.4s, v16.4s, v20.4s\n" + "fmin v17.4s, v17.4s, v20.4s\n" + "fmin v18.4s, v18.4s, v20.4s\n" + "fmin v19.4s, v19.4s, v20.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "68:" // Height 2: No activation "cmp x16, #0x18\n" "bge 81f\n" @@ -1339,30 +1338,30 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "101:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w12, [x20, x13, 
LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 102f\n" - "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x11, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x23, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x10, [x20, #0x8]\n" + "ldr x9, [x20, #0x10]\n" "cbnz x13, 103f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x11, x11, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" + "add x9, x9, x20, LSL #2\n" "b 103f\n" "102:" // Height 3: setup direct input "mov x11, %x[input_ptr]\n" - "add x25, x11, x20, LSL #2\n" - "add x23, x25, x20, LSL #2\n" + "add x10, x11, x21, LSL #2\n" + "add x9, x10, x21, LSL #2\n" "103:" // Height 3: input setup done "cmp x12, #0x4\n" "blt 106f\n" "ldr q0, [x11, #0x0]\n" "cmp x12, #0x8\n" - "ldr q1, [x25, #0x0]\n" - "ldr q2, [x23, #0x0]\n" + "ldr q1, [x10, #0x0]\n" + "ldr q2, [x9, #0x0]\n" "ldr q4, [x15, #0x0]\n" "ldr q5, [x15, #0x10]\n" "ldr q6, [x15, #0x20]\n" @@ -1370,301 +1369,301 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "blt 105f\n" "104:" // Height 3: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr x10, [x15, #0x48]\n" + "ldr x23, [x15, #0x48]\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr x9, [x15, #0x58]\n" + "ldr x22, [x15, #0x58]\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr d4, [x15, #0x40]\n" + "ldr d29, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr x28, [x15, #0x68]\n" + "ldr x21, [x15, #0x68]\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr x27, [x15, #0x78]\n" + "ldr x20, [x15, #0x78]\n" "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr d5, [x15, #0x50]\n" + "ldr d28, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "mov v4.d[1], x10\n" + "mov v29.d[1], x23\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "mov v5.d[1], x9\n" + "mov v28.d[1], x22\n" "fmla v22.4s, v6.4s, v2.s[0]\n" - "ldr d6, [x15, #0x60]\n" + "ldr d27, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x28\n" + "mov v27.d[1], x21\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr x10, [x15, #0x88]\n" + "ldr x23, [x15, #0x88]\n" "fmla v23.4s, v7.4s, v2.s[0]\n" - "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x27\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr x9, [x15, #0x98]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "ldr d4, [x15, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr x28, [x15, #0xa8]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr x27, [x15, #0xb8]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "ldr d5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "mov v4.d[1], x10\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "mov v5.d[1], x9\n" - "fmla v20.4s, v6.4s, v2.s[1]\n" - "ldr d6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x28\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x10, [x15, #0xc8]\n" - "fmla v21.4s, v7.4s, v2.s[1]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x27\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "ldr x9, [x15, #0xd8]\n" - "fmla v22.4s, v4.4s, v2.s[1]\n" - "ldr d4, [x15, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr x28, [x15, #0xe8]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr x27, [x15, #0xf8]\n" - "fmla v23.4s, v5.4s, v2.s[1]\n" - "ldr d5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "mov v4.d[1], x10\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "mov v5.d[1], x9\n" - "fmla v24.4s, v6.4s, v2.s[1]\n" - "ldr d6, 
[x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x28\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr x10, [x15, #0x108]\n" - "fmla v25.4s, v7.4s, v2.s[1]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x27\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr x9, [x15, #0x118]\n" - "fmla v20.4s, v4.4s, v2.s[2]\n" - "ldr d4, [x15, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr x28, [x15, #0x128]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr x27, [x15, #0x138]\n" - "fmla v21.4s, v5.4s, v2.s[2]\n" - "ldr d5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "mov v4.d[1], x10\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "mov v5.d[1], x9\n" - "fmla v22.4s, v6.4s, v2.s[2]\n" - "ldr d6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x28\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr x10, [x15, #0x148]\n" - "fmla v23.4s, v7.4s, v2.s[2]\n" - "ldr d7, [x15, #0x130]\n" - "mov v7.d[1], x27\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr x9, [x15, #0x158]\n" - "fmla v24.4s, v4.4s, v2.s[2]\n" - "ldr d4, [x15, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr x28, [x15, #0x168]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "ldr x27, [x15, #0x178]\n" - "fmla v25.4s, v5.4s, v2.s[2]\n" - "ldr d5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "mov v4.d[1], x10\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "mov v5.d[1], x9\n" - "fmla v20.4s, v6.4s, v2.s[3]\n" - "ldr d6, [x15, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "mov v6.d[1], x28\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "ldr d26, [x15, #0x70]\n" + "mov v26.d[1], x20\n" + "fmla v12.4s, v29.4s, v0.s[0]\n" + "fmla v18.4s, v29.4s, v1.s[0]\n" + "ldr x22, [x15, #0x98]\n" + "fmla v24.4s, v29.4s, v2.s[0]\n" + "ldr d29, [x15, #0x80]\n" + "fmla v13.4s, v28.4s, v0.s[0]\n" + "ldr x21, [x15, #0xa8]\n" + "fmla v19.4s, v28.4s, v1.s[0]\n" + "ldr x20, [x15, #0xb8]\n" + "fmla v25.4s, v28.4s, v2.s[0]\n" + "ldr d28, [x15, #0x90]\n" + "fmla v8.4s, v27.4s, v0.s[1]\n" + "mov v29.d[1], x23\n" + "fmla v14.4s, v27.4s, v1.s[1]\n" + "mov v28.d[1], x22\n" + "fmla v20.4s, v27.4s, v2.s[1]\n" + "ldr d27, [x15, #0xa0]\n" + "fmla v9.4s, v26.4s, v0.s[1]\n" + "mov v27.d[1], x21\n" + "fmla v15.4s, v26.4s, v1.s[1]\n" + "ldr x23, [x15, #0xc8]\n" + "fmla v21.4s, v26.4s, v2.s[1]\n" + "ldr d26, [x15, #0xb0]\n" + "mov v26.d[1], x20\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v16.4s, v29.4s, v1.s[1]\n" + "ldr x22, [x15, #0xd8]\n" + "fmla v22.4s, v29.4s, v2.s[1]\n" + "ldr d29, [x15, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "ldr x21, [x15, #0xe8]\n" + "fmla v17.4s, v28.4s, v1.s[1]\n" + "ldr x20, [x15, #0xf8]\n" + "fmla v23.4s, v28.4s, v2.s[1]\n" + "ldr d28, [x15, #0xd0]\n" + "fmla v12.4s, v27.4s, v0.s[1]\n" + "mov v29.d[1], x23\n" + "fmla v18.4s, v27.4s, v1.s[1]\n" + "mov v28.d[1], x22\n" + "fmla v24.4s, v27.4s, v2.s[1]\n" + "ldr d27, [x15, #0xe0]\n" + "fmla v13.4s, v26.4s, v0.s[1]\n" + "mov v27.d[1], x21\n" + "fmla v19.4s, v26.4s, v1.s[1]\n" + "ldr x23, [x15, #0x108]\n" + "fmla v25.4s, v26.4s, v2.s[1]\n" + "ldr d26, [x15, #0xf0]\n" + "mov v26.d[1], x20\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "ldr x22, [x15, #0x118]\n" + "fmla v20.4s, v29.4s, v2.s[2]\n" + "ldr d29, [x15, #0x100]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "ldr x21, [x15, #0x128]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "ldr x20, [x15, #0x138]\n" + "fmla v21.4s, v28.4s, v2.s[2]\n" + "ldr d28, [x15, #0x110]\n" + "fmla v10.4s, v27.4s, v0.s[2]\n" + "mov v29.d[1], x23\n" + "fmla v16.4s, v27.4s, v1.s[2]\n" + "mov v28.d[1], 
x22\n" + "fmla v22.4s, v27.4s, v2.s[2]\n" + "ldr d27, [x15, #0x120]\n" + "fmla v11.4s, v26.4s, v0.s[2]\n" + "mov v27.d[1], x21\n" + "fmla v17.4s, v26.4s, v1.s[2]\n" + "ldr x23, [x15, #0x148]\n" + "fmla v23.4s, v26.4s, v2.s[2]\n" + "ldr d26, [x15, #0x130]\n" + "mov v26.d[1], x20\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v18.4s, v29.4s, v1.s[2]\n" + "ldr x22, [x15, #0x158]\n" + "fmla v24.4s, v29.4s, v2.s[2]\n" + "ldr d29, [x15, #0x140]\n" + "fmla v13.4s, v28.4s, v0.s[2]\n" + "ldr x21, [x15, #0x168]\n" + "fmla v19.4s, v28.4s, v1.s[2]\n" + "ldr x20, [x15, #0x178]\n" + "fmla v25.4s, v28.4s, v2.s[2]\n" + "ldr d28, [x15, #0x150]\n" + "fmla v8.4s, v27.4s, v0.s[3]\n" + "mov v29.d[1], x23\n" + "fmla v14.4s, v27.4s, v1.s[3]\n" + "mov v28.d[1], x22\n" + "fmla v20.4s, v27.4s, v2.s[3]\n" + "ldr d27, [x15, #0x160]\n" + "fmla v9.4s, v26.4s, v0.s[3]\n" + "mov v27.d[1], x21\n" + "fmla v15.4s, v26.4s, v1.s[3]\n" "add x11, x11, #0x10\n" - "fmla v21.4s, v7.4s, v2.s[3]\n" - "ldr d7, [x15, #0x170]\n" - "mov v7.d[1], x27\n" - "add x25, x25, #0x10\n" - "add x23, x23, #0x10\n" + "fmla v21.4s, v26.4s, v2.s[3]\n" + "ldr d26, [x15, #0x170]\n" + "mov v26.d[1], x20\n" + "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" "add x15, x15, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "ldr x10, [x15, #0x8]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "ldr x9, [x15, #0x18]\n" - "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "ldr x26, [x15, #0x8]\n" + "fmla v16.4s, v29.4s, v1.s[3]\n" + "ldr x25, [x15, #0x18]\n" + "fmla v22.4s, v29.4s, v2.s[3]\n" "ldr d4, [x15, #0x0]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "ldr x28, [x15, #0x28]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" - "ldr x26, [x11, #0x8]\n" - "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "ldr x24, [x15, #0x28]\n" + "fmla v17.4s, v28.4s, v1.s[3]\n" + "ldr x23, [x11, #0x8]\n" + "fmla v23.4s, v28.4s, v2.s[3]\n" "ldr d5, [x15, #0x10]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "ldr x24, [x25, #0x8]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "ldr x22, [x23, #0x8]\n" - "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v12.4s, v27.4s, v0.s[3]\n" + "ldr x22, [x10, #0x8]\n" + "fmla v18.4s, v27.4s, v1.s[3]\n" + "ldr x21, [x9, #0x8]\n" + "fmla v24.4s, v27.4s, v2.s[3]\n" "ldr d6, [x15, #0x20]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v26.4s, v0.s[3]\n" "ldr d0, [x11, #0x0]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x25, #0x0]\n" - "fmla v25.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x23, #0x0]\n" + "fmla v19.4s, v26.4s, v1.s[3]\n" + "ldr d1, [x10, #0x0]\n" + "fmla v25.4s, v26.4s, v2.s[3]\n" + "ldr d2, [x9, #0x0]\n" "ldr d7, [x15, #0x30]\n" "sub x12, x12, #0x4\n" - "ldr x27, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" "cmp x12, #0x8\n" "prfm pldl1keep, [x11, #0x80]\n" - "mov v4.d[1], x10\n" - "prfm pldl1keep, [x25, #0x80]\n" - "mov v5.d[1], x9\n" - "prfm pldl1keep, [x23, #0x80]\n" - "mov v6.d[1], x28\n" - "mov v0.d[1], x26\n" - "mov v1.d[1], x24\n" - "mov v2.d[1], x22\n" - "mov v7.d[1], x27\n" + "mov v4.d[1], x26\n" + "prfm pldl1keep, [x10, #0x80]\n" + "mov v5.d[1], x25\n" + "prfm pldl1keep, [x9, #0x80]\n" + "mov v6.d[1], x24\n" + "mov v0.d[1], x23\n" + "mov v1.d[1], x22\n" + "mov v2.d[1], x21\n" + "mov v7.d[1], x20\n" "bge 104b\n" "105:" // Height 3: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" "add x11, x11, #0x10\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x15, #0x40]\n" + "ldr q29, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "add x23, x23, 
#0x10\n" + "add x9, x9, #0x10\n" "fmla v15.4s, v5.4s, v1.s[0]\n" "sub x12, x12, #0x4\n" "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x15, #0x50]\n" + "ldr q28, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "prfm pldl1keep, [x11, #0x80]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v22.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x15, #0x60]\n" + "ldr q27, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" "fmla v23.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x15, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x15, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x15, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v20.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x15, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v21.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "fmla v22.4s, v4.4s, v2.s[1]\n" - "ldr q4, [x15, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "fmla v23.4s, v5.4s, v2.s[1]\n" - "ldr q5, [x15, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "fmla v24.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x15, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "fmla v25.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x15, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "fmla v20.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x15, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "fmla v21.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x15, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "fmla v22.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x15, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "fmla v23.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x15, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "fmla v24.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x15, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "fmla v25.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x15, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v20.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x15, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v21.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x15, #0x170]\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" + "ldr q26, [x15, #0x70]\n" + "fmla v12.4s, v29.4s, v0.s[0]\n" + "fmla v18.4s, v29.4s, v1.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[0]\n" + "ldr q29, [x15, #0x80]\n" + "fmla v13.4s, v28.4s, v0.s[0]\n" + "fmla v19.4s, v28.4s, v1.s[0]\n" + "fmla v25.4s, v28.4s, v2.s[0]\n" + "ldr q28, [x15, #0x90]\n" + "fmla v8.4s, v27.4s, v0.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[1]\n" + "fmla v20.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x15, #0xa0]\n" + "fmla v9.4s, v26.4s, v0.s[1]\n" + "fmla v15.4s, v26.4s, v1.s[1]\n" + "fmla v21.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x15, #0xb0]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v16.4s, v29.4s, v1.s[1]\n" + "fmla v22.4s, v29.4s, v2.s[1]\n" + "ldr q29, [x15, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v17.4s, v28.4s, v1.s[1]\n" + "fmla v23.4s, v28.4s, v2.s[1]\n" + "ldr q28, [x15, #0xd0]\n" + "fmla 
v12.4s, v27.4s, v0.s[1]\n" + "fmla v18.4s, v27.4s, v1.s[1]\n" + "fmla v24.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x15, #0xe0]\n" + "fmla v13.4s, v26.4s, v0.s[1]\n" + "fmla v19.4s, v26.4s, v1.s[1]\n" + "fmla v25.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x15, #0xf0]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v20.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x15, #0x100]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v21.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x15, #0x110]\n" + "fmla v10.4s, v27.4s, v0.s[2]\n" + "fmla v16.4s, v27.4s, v1.s[2]\n" + "fmla v22.4s, v27.4s, v2.s[2]\n" + "ldr q27, [x15, #0x120]\n" + "fmla v11.4s, v26.4s, v0.s[2]\n" + "fmla v17.4s, v26.4s, v1.s[2]\n" + "fmla v23.4s, v26.4s, v2.s[2]\n" + "ldr q26, [x15, #0x130]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v18.4s, v29.4s, v1.s[2]\n" + "fmla v24.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x15, #0x140]\n" + "fmla v13.4s, v28.4s, v0.s[2]\n" + "fmla v19.4s, v28.4s, v1.s[2]\n" + "fmla v25.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x15, #0x150]\n" + "fmla v8.4s, v27.4s, v0.s[3]\n" + "fmla v14.4s, v27.4s, v1.s[3]\n" + "fmla v20.4s, v27.4s, v2.s[3]\n" + "ldr q27, [x15, #0x160]\n" + "fmla v9.4s, v26.4s, v0.s[3]\n" + "fmla v15.4s, v26.4s, v1.s[3]\n" + "fmla v21.4s, v26.4s, v2.s[3]\n" + "ldr q26, [x15, #0x170]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" "add x15, x15, #0x180\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "fmla v22.4s, v4.4s, v2.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" - "fmla v23.4s, v5.4s, v2.s[3]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "fmla v24.4s, v6.4s, v2.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" - "fmla v25.4s, v7.4s, v2.s[3]\n" + "fmla v16.4s, v29.4s, v1.s[3]\n" + "fmla v22.4s, v29.4s, v2.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v17.4s, v28.4s, v1.s[3]\n" + "fmla v23.4s, v28.4s, v2.s[3]\n" + "fmla v12.4s, v27.4s, v0.s[3]\n" + "fmla v18.4s, v27.4s, v1.s[3]\n" + "fmla v24.4s, v27.4s, v2.s[3]\n" + "fmla v13.4s, v26.4s, v0.s[3]\n" + "fmla v19.4s, v26.4s, v1.s[3]\n" + "fmla v25.4s, v26.4s, v2.s[3]\n" "106:" // Height 3: Multiply loop: Main loop skip "cbz x12, 108f\n" "107:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x11], #0x4\n" "sub x12, x12, #0x1\n" - "ldr s1, [x25], #0x4\n" - "ldr s2, [x23], #0x4\n" - "ldr q4, [x15, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x15, #0x10]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr q7, [x15, #0x30]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q4, [x15, #0x40]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x15, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr s31, [x10], #0x4\n" + "ldr s30, [x9], #0x4\n" + "ldr q27, [x15, #0x0]\n" + "fmla v8.4s, v27.4s, v0.s[0]\n" + "ldr q26, [x15, #0x10]\n" + "fmla v14.4s, v27.4s, v31.s[0]\n" + "ldr q29, [x15, #0x20]\n" + "fmla v20.4s, v27.4s, v30.s[0]\n" + "ldr q28, [x15, #0x30]\n" + "fmla v9.4s, v26.4s, v0.s[0]\n" + "ldr q27, [x15, #0x40]\n" + "fmla v15.4s, v26.4s, v31.s[0]\n" + "fmla v21.4s, v26.4s, v30.s[0]\n" + "ldr q26, [x15, #0x50]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" "add x15, x15, #0x60\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" - "fmla v22.4s, v6.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" - "fmla v23.4s, v7.4s, v2.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "fmla v13.4s, v5.4s, 
v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v16.4s, v29.4s, v31.s[0]\n" + "fmla v22.4s, v29.4s, v30.s[0]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v17.4s, v28.4s, v31.s[0]\n" + "fmla v23.4s, v28.4s, v30.s[0]\n" + "fmla v12.4s, v27.4s, v0.s[0]\n" + "fmla v18.4s, v27.4s, v31.s[0]\n" + "fmla v24.4s, v27.4s, v30.s[0]\n" + "fmla v13.4s, v26.4s, v0.s[0]\n" + "fmla v19.4s, v26.4s, v31.s[0]\n" + "fmla v25.4s, v26.4s, v30.s[0]\n" "cbnz x12, 107b\n" "108:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1679,45 +1678,45 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 109f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" - "fmin v16.4s, v16.4s, v0.4s\n" - "fmin v17.4s, v17.4s, v0.4s\n" - "fmin v18.4s, v18.4s, v0.4s\n" - "fmin v19.4s, v19.4s, v0.4s\n" - "fmin v20.4s, v20.4s, v0.4s\n" - "fmin v21.4s, v21.4s, v0.4s\n" - "fmin v22.4s, v22.4s, v0.4s\n" - "fmin v23.4s, v23.4s, v0.4s\n" - "fmin v24.4s, v24.4s, v0.4s\n" - "fmin v25.4s, v25.4s, v0.4s\n" + "ld1r { v26.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v21.4s, v21.4s, v26.4s\n" + "fmin v22.4s, v22.4s, v26.4s\n" + "fmin v23.4s, v23.4s, v26.4s\n" + "fmin v24.4s, v24.4s, v26.4s\n" + "fmin v25.4s, v25.4s, v26.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" + "ld1r { v26.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v26.4s\n" + "fmax v9.4s, v9.4s, v26.4s\n" + "fmax v10.4s, v10.4s, v26.4s\n" + "fmax v11.4s, v11.4s, v26.4s\n" + "fmax v12.4s, v12.4s, v26.4s\n" + "fmax v13.4s, v13.4s, v26.4s\n" + "fmax v14.4s, v14.4s, v26.4s\n" + "fmax v15.4s, v15.4s, v26.4s\n" + "fmax v16.4s, v16.4s, v26.4s\n" + "fmax v17.4s, v17.4s, v26.4s\n" + "fmax v18.4s, v18.4s, v26.4s\n" + "fmax v19.4s, v19.4s, v26.4s\n" + "fmax v20.4s, v20.4s, v26.4s\n" + "fmax v21.4s, v21.4s, v26.4s\n" + "fmax v22.4s, v22.4s, v26.4s\n" + "fmax v23.4s, v23.4s, v26.4s\n" + "fmax v24.4s, v24.4s, v26.4s\n" + "fmax v25.4s, v25.4s, v26.4s\n" "109:" // Height 3: No activation "cmp x16, #0x18\n" "bge 122f\n" @@ -2139,34 +2138,34 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "142:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w12, [x20, x13, LSL 
#0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 143f\n" - "ldr x21, [%x[input_ptr], x13, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x11, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x23, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x11, [x20, #0x0]\n" + "ldr x10, [x20, #0x8]\n" + "ldr x9, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" "cbnz x13, 144f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x11, x11, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x21, x21, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" + "add x9, x9, x20, LSL #2\n" + "add x28, x28, x20, LSL #2\n" "b 144f\n" "143:" // Height 4: setup direct input "mov x11, %x[input_ptr]\n" - "add x25, x11, x20, LSL #2\n" - "add x23, x25, x20, LSL #2\n" - "add x21, x23, x20, LSL #2\n" + "add x10, x11, x21, LSL #2\n" + "add x9, x10, x21, LSL #2\n" + "add x28, x9, x21, LSL #2\n" "144:" // Height 4: input setup done "cmp x12, #0x4\n" "blt 147f\n" "ldr q0, [x11, #0x0]\n" "cmp x12, #0x8\n" - "ldr q1, [x25, #0x0]\n" - "ldr q2, [x23, #0x0]\n" - "ldr q3, [x21, #0x0]\n" + "ldr q1, [x10, #0x0]\n" + "ldr q2, [x9, #0x0]\n" + "ldr q3, [x28, #0x0]\n" "ldr q4, [x15, #0x0]\n" "ldr q5, [x15, #0x10]\n" "ldr q6, [x15, #0x20]\n" @@ -2174,177 +2173,177 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "blt 146f\n" "145:" // Height 4: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr x10, [x15, #0x48]\n" + "ldr x23, [x15, #0x48]\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr x9, [x15, #0x58]\n" + "ldr x22, [x15, #0x58]\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr x28, [x15, #0x68]\n" + "ldr x21, [x15, #0x68]\n" "fmla v26.4s, v4.4s, v3.s[0]\n" "ldr d4, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr x27, [x15, #0x78]\n" + "ldr x20, [x15, #0x78]\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr x10, [x15, #0x88]\n" + "ldr x23, [x15, #0x88]\n" "fmla v27.4s, v5.4s, v3.s[0]\n" "ldr d5, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "ldr x9, [x15, #0x98]\n" + "ldr x22, [x15, #0x98]\n" "fmla v22.4s, v6.4s, v2.s[0]\n" "add x11, x11, #0x10\n" "fmla v28.4s, v6.4s, v3.s[0]\n" "ldr d6, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x28\n" + "mov v6.d[1], x21\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr x28, [x15, #0xa8]\n" + "ldr x21, [x15, #0xa8]\n" "fmla v23.4s, v7.4s, v2.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v29.4s, v7.4s, v3.s[0]\n" "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x27\n" + "mov v7.d[1], x20\n" "fmla v12.4s, v4.4s, v0.s[0]\n" "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr x27, [x15, #0xb8]\n" + "ldr x20, [x15, #0xb8]\n" "fmla v24.4s, v4.4s, v2.s[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v30.4s, v4.4s, v3.s[0]\n" "ldr d4, [x15, #0x80]\n" "fmla v13.4s, v5.4s, v0.s[0]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr x10, [x15, #0xc8]\n" + "ldr x23, [x15, #0xc8]\n" "fmla v25.4s, v5.4s, v2.s[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v31.4s, v5.4s, v3.s[0]\n" "ldr d5, [x15, #0x90]\n" "fmla v8.4s, v6.4s, v0.s[1]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr x9, [x15, #0xd8]\n" + "ldr x22, [x15, #0xd8]\n" "fmla v20.4s, 
v6.4s, v2.s[1]\n" - "ldr x26, [x11, #0x8]\n" + "ldr x27, [x11, #0x8]\n" "fmla v26.4s, v6.4s, v3.s[1]\n" "ldr d6, [x15, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x28\n" + "mov v6.d[1], x21\n" "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x28, [x15, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" "fmla v21.4s, v7.4s, v2.s[1]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x26, [x10, #0x8]\n" "fmla v27.4s, v7.4s, v3.s[1]\n" "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x27\n" + "mov v7.d[1], x20\n" "fmla v10.4s, v4.4s, v0.s[1]\n" "fmla v16.4s, v4.4s, v1.s[1]\n" - "ldr x27, [x15, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" "fmla v22.4s, v4.4s, v2.s[1]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x25, [x9, #0x8]\n" "fmla v28.4s, v4.4s, v3.s[1]\n" "ldr d4, [x15, #0xc0]\n" "fmla v11.4s, v5.4s, v0.s[1]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr x10, [x15, #0x108]\n" + "ldr x23, [x15, #0x108]\n" "fmla v23.4s, v5.4s, v2.s[1]\n" - "ldr x20, [x21, #0x8]\n" + "ldr x24, [x28, #0x8]\n" "fmla v29.4s, v5.4s, v3.s[1]\n" "ldr d5, [x15, #0xd0]\n" "fmla v12.4s, v6.4s, v0.s[1]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v18.4s, v6.4s, v1.s[1]\n" - "ldr x9, [x15, #0x118]\n" + "ldr x22, [x15, #0x118]\n" "fmla v24.4s, v6.4s, v2.s[1]\n" "sub x12, x12, #0x4\n" "fmla v30.4s, v6.4s, v3.s[1]\n" "ldr d6, [x15, #0xe0]\n" "fmla v13.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x28\n" + "mov v6.d[1], x21\n" "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr x28, [x15, #0x128]\n" + "ldr x21, [x15, #0x128]\n" "fmla v25.4s, v7.4s, v2.s[1]\n" "cmp x12, #0x8\n" "fmla v31.4s, v7.4s, v3.s[1]\n" "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x27\n" + "mov v7.d[1], x20\n" "fmla v8.4s, v4.4s, v0.s[2]\n" "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr x27, [x15, #0x138]\n" + "ldr x20, [x15, #0x138]\n" "fmla v20.4s, v4.4s, v2.s[2]\n" "prfm pldl1keep, [x11, #0x80]\n" "fmla v26.4s, v4.4s, v3.s[2]\n" "ldr d4, [x15, #0x100]\n" "fmla v9.4s, v5.4s, v0.s[2]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr x10, [x15, #0x148]\n" + "ldr x23, [x15, #0x148]\n" "fmla v21.4s, v5.4s, v2.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v27.4s, v5.4s, v3.s[2]\n" "ldr d5, [x15, #0x110]\n" "fmla v10.4s, v6.4s, v0.s[2]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v16.4s, v6.4s, v1.s[2]\n" - "ldr x9, [x15, #0x158]\n" + "ldr x22, [x15, #0x158]\n" "fmla v22.4s, v6.4s, v2.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v28.4s, v6.4s, v3.s[2]\n" "ldr d6, [x15, #0x120]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x28\n" + "mov v6.d[1], x21\n" "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr x28, [x15, #0x168]\n" + "ldr x21, [x15, #0x168]\n" "fmla v23.4s, v7.4s, v2.s[2]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v29.4s, v7.4s, v3.s[2]\n" "ldr d7, [x15, #0x130]\n" - "mov v7.d[1], x27\n" + "mov v7.d[1], x20\n" "fmla v12.4s, v4.4s, v0.s[2]\n" "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr x27, [x15, #0x178]\n" + "ldr x20, [x15, #0x178]\n" "fmla v24.4s, v4.4s, v2.s[2]\n" "fmla v30.4s, v4.4s, v3.s[2]\n" "ldr d4, [x15, #0x140]\n" "fmla v13.4s, v5.4s, v0.s[2]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v19.4s, v5.4s, v1.s[2]\n" "fmla v25.4s, v5.4s, v2.s[2]\n" "fmla v31.4s, v5.4s, v3.s[2]\n" "ldr d5, [x15, #0x150]\n" "fmla v8.4s, v6.4s, v0.s[3]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v14.4s, v6.4s, v1.s[3]\n" "fmla v20.4s, v6.4s, v2.s[3]\n" "fmla v26.4s, v6.4s, v3.s[3]\n" "ldr d6, [x15, #0x160]\n" "fmla v9.4s, v7.4s, 
v0.s[3]\n" - "mov v6.d[1], x28\n" + "mov v6.d[1], x21\n" "fmla v15.4s, v7.4s, v1.s[3]\n" "fmla v21.4s, v7.4s, v2.s[3]\n" "fmla v27.4s, v7.4s, v3.s[3]\n" "ldr d7, [x15, #0x170]\n" - "mov v7.d[1], x27\n" + "mov v7.d[1], x20\n" "add x15, x15, #0x180\n" "fmla v10.4s, v4.4s, v0.s[3]\n" - "ldr x10, [x15, #0x8]\n" + "ldr x23, [x15, #0x8]\n" "fmla v16.4s, v4.4s, v1.s[3]\n" - "ldr x9, [x15, #0x18]\n" + "ldr x22, [x15, #0x18]\n" "fmla v22.4s, v4.4s, v2.s[3]\n" - "ldr x28, [x15, #0x28]\n" + "ldr x21, [x15, #0x28]\n" "fmla v28.4s, v4.4s, v3.s[3]\n" "ldr d4, [x15, #0x0]\n" "fmla v11.4s, v5.4s, v0.s[3]\n" - "ldr x27, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" "fmla v17.4s, v5.4s, v1.s[3]\n" - "mov v4.d[1], x10\n" + "mov v4.d[1], x23\n" "fmla v23.4s, v5.4s, v2.s[3]\n" "fmla v29.4s, v5.4s, v3.s[3]\n" "ldr d5, [x15, #0x10]\n" "fmla v12.4s, v6.4s, v0.s[3]\n" - "mov v5.d[1], x9\n" + "mov v5.d[1], x22\n" "fmla v18.4s, v6.4s, v1.s[3]\n" "fmla v24.4s, v6.4s, v2.s[3]\n" "fmla v30.4s, v6.4s, v3.s[3]\n" @@ -2352,30 +2351,30 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "fmla v13.4s, v7.4s, v0.s[3]\n" "ldr d0, [x11, #0x0]\n" "fmla v19.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x25, #0x0]\n" + "ldr d1, [x10, #0x0]\n" "fmla v25.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x23, #0x0]\n" + "ldr d2, [x9, #0x0]\n" "fmla v31.4s, v7.4s, v3.s[3]\n" - "ldr d3, [x21, #0x0]\n" + "ldr d3, [x28, #0x0]\n" "ldr d7, [x15, #0x30]\n" - "mov v6.d[1], x28\n" - "mov v0.d[1], x26\n" - "mov v1.d[1], x24\n" - "mov v2.d[1], x22\n" - "mov v3.d[1], x20\n" - "mov v7.d[1], x27\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x26\n" + "mov v2.d[1], x25\n" + "mov v3.d[1], x24\n" + "mov v7.d[1], x20\n" "bge 145b\n" "146:" // Height 4: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" "add x11, x11, #0x10\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v26.4s, v4.4s, v3.s[0]\n" "ldr q4, [x15, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v15.4s, v5.4s, v1.s[0]\n" "sub x12, x12, #0x4\n" "fmla v21.4s, v5.4s, v2.s[0]\n" @@ -2383,11 +2382,11 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "fmla v27.4s, v5.4s, v3.s[0]\n" "ldr q5, [x15, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v22.4s, v6.4s, v2.s[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v28.4s, v6.4s, v3.s[0]\n" "ldr q6, [x15, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" @@ -2495,42 +2494,42 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "147:" // Height 4: Multiply loop: Main loop skip "cbz x12, 149f\n" "148:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" + "ldr s7, [x11], #0x4\n" "sub x12, x12, #0x1\n" - "ldr s1, [x25], #0x4\n" - "ldr s2, [x23], #0x4\n" - "ldr s3, [x21], #0x4\n" - "ldr q4, [x15, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x15, #0x10]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q6, [x15, #0x20]\n" - "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr q7, [x15, #0x30]\n" - "fmla v26.4s, v4.4s, v3.s[0]\n" - "ldr q4, [x15, #0x40]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "fmla v21.4s, v5.4s, v2.s[0]\n" - "fmla v27.4s, v5.4s, v3.s[0]\n" - "ldr q5, [x15, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr s6, [x10], #0x4\n" + "ldr s5, [x9], #0x4\n" + "ldr s4, 
[x28], #0x4\n" + "ldr q1, [x15, #0x0]\n" + "fmla v8.4s, v1.4s, v7.s[0]\n" + "ldr q0, [x15, #0x10]\n" + "fmla v14.4s, v1.4s, v6.s[0]\n" + "ldr q3, [x15, #0x20]\n" + "fmla v20.4s, v1.4s, v5.s[0]\n" + "ldr q2, [x15, #0x30]\n" + "fmla v26.4s, v1.4s, v4.s[0]\n" + "ldr q1, [x15, #0x40]\n" + "fmla v9.4s, v0.4s, v7.s[0]\n" + "fmla v15.4s, v0.4s, v6.s[0]\n" + "fmla v21.4s, v0.4s, v5.s[0]\n" + "fmla v27.4s, v0.4s, v4.s[0]\n" + "ldr q0, [x15, #0x50]\n" + "fmla v10.4s, v3.4s, v7.s[0]\n" "add x15, x15, #0x60\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" - "fmla v22.4s, v6.4s, v2.s[0]\n" - "fmla v28.4s, v6.4s, v3.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" - "fmla v23.4s, v7.4s, v2.s[0]\n" - "fmla v29.4s, v7.4s, v3.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "fmla v30.4s, v4.4s, v3.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "fmla v31.4s, v5.4s, v3.s[0]\n" + "fmla v16.4s, v3.4s, v6.s[0]\n" + "fmla v22.4s, v3.4s, v5.s[0]\n" + "fmla v28.4s, v3.4s, v4.s[0]\n" + "fmla v11.4s, v2.4s, v7.s[0]\n" + "fmla v17.4s, v2.4s, v6.s[0]\n" + "fmla v23.4s, v2.4s, v5.s[0]\n" + "fmla v29.4s, v2.4s, v4.s[0]\n" + "fmla v12.4s, v1.4s, v7.s[0]\n" + "fmla v18.4s, v1.4s, v6.s[0]\n" + "fmla v24.4s, v1.4s, v5.s[0]\n" + "fmla v30.4s, v1.4s, v4.s[0]\n" + "fmla v13.4s, v0.4s, v7.s[0]\n" + "fmla v19.4s, v0.4s, v6.s[0]\n" + "fmla v25.4s, v0.4s, v5.s[0]\n" + "fmla v31.4s, v0.4s, v4.s[0]\n" "cbnz x12, 148b\n" "149:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2796,7 +2795,6 @@ void a64_hybrid_fp32_mla_4x24_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "166:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp index 5fb71c95b7..dbd45460e8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_4x24 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 124f\n" @@ -223,11 +222,11 @@ void a64_hybrid_fp32_mla_4x24 ( "19:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + 
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -246,126 +245,126 @@ void a64_hybrid_fp32_mla_4x24 ( "blt 23f\n" "22:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q19, [x28, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q18, [x28, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q17, [x28, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x28, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x28, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr q5, [x28, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x28, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "ldr q16, [x28, #0x70]\n" + "fmla v12.4s, v19.4s, v0.s[0]\n" + "ldr q19, [x28, #0x80]\n" + "fmla v13.4s, v18.4s, v0.s[0]\n" + "ldr q18, [x28, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x28, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x28, #0xb0]\n" + "fmla v10.4s, v19.4s, v0.s[1]\n" + "ldr q19, [x28, #0xc0]\n" + "fmla v11.4s, v18.4s, v0.s[1]\n" + "ldr q18, [x28, #0xd0]\n" + "fmla v12.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x28, #0xe0]\n" + "fmla v13.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x28, #0xf0]\n" + "fmla v8.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x28, #0x100]\n" + "fmla v9.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x28, #0x110]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x28, #0x120]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x28, #0x130]\n" + "fmla v12.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x28, #0x140]\n" + "fmla v13.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x28, #0x150]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x28, #0x160]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x28, #0x170]\n" "sub x25, x25, #0x4\n" "add x24, x24, #0x10\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v10.4s, v19.4s, v0.s[3]\n" + "fmla v11.4s, v18.4s, v0.s[3]\n" "cmp x25, #0x8\n" "add x28, x28, #0x180\n" "ldr q4, [x28, #0x0]\n" "ldr q5, [x28, #0x10]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v0.s[3]\n" "ldr q6, [x28, #0x20]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v0.s[3]\n" "ldr q0, [x24, #0x0]\n" "ldr q7, [x28, #0x30]\n" "prfm pldl1keep, [x24, #0x80]\n" "bge 22b\n" "23:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q19, [x28, #0x40]\n" "fmla v9.4s, v5.4s, 
v0.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q18, [x28, #0x50]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q17, [x28, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x28, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr q4, [x28, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "ldr q5, [x28, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x28, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "ldr q16, [x28, #0x70]\n" + "fmla v12.4s, v19.4s, v0.s[0]\n" + "ldr q19, [x28, #0x80]\n" + "fmla v13.4s, v18.4s, v0.s[0]\n" + "ldr q18, [x28, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x28, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x28, #0xb0]\n" + "fmla v10.4s, v19.4s, v0.s[1]\n" + "ldr q19, [x28, #0xc0]\n" + "fmla v11.4s, v18.4s, v0.s[1]\n" + "ldr q18, [x28, #0xd0]\n" + "fmla v12.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x28, #0xe0]\n" + "fmla v13.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x28, #0xf0]\n" + "fmla v8.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x28, #0x100]\n" + "fmla v9.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x28, #0x110]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x28, #0x120]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x28, #0x130]\n" + "fmla v12.4s, v19.4s, v0.s[2]\n" + "ldr q19, [x28, #0x140]\n" + "fmla v13.4s, v18.4s, v0.s[2]\n" + "ldr q18, [x28, #0x150]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x28, #0x160]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x28, #0x170]\n" "add x24, x24, #0x10\n" "sub x25, x25, #0x4\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" + "fmla v10.4s, v19.4s, v0.s[3]\n" + "fmla v11.4s, v18.4s, v0.s[3]\n" "prfm pldl1keep, [x24, #0x80]\n" "add x28, x28, #0x180\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v0.s[3]\n" "24:" // Height 1: Multiply loop: Main loop skip "cbz x25, 26f\n" "25:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x24], #0x4\n" - "ldr q4, [x28, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" + "ldr s18, [x24], #0x4\n" + "ldr q16, [x28, #0x0]\n" + "fmla v8.4s, v16.4s, v18.s[0]\n" "sub x25, x25, #0x1\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q4, [x28, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "ldr q5, [x28, #0x50]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" + "ldr q17, [x28, #0x10]\n" + "ldr q16, [x28, #0x20]\n" + "fmla v9.4s, v17.4s, v18.s[0]\n" + "fmla v10.4s, v16.4s, v18.s[0]\n" + "ldr q17, [x28, #0x30]\n" + "ldr q16, [x28, #0x40]\n" + "fmla v11.4s, v17.4s, v18.s[0]\n" + "fmla v12.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x28, #0x50]\n" + "fmla v13.4s, v16.4s, v18.s[0]\n" "add x28, 
x28, #0x60\n" "cbnz x25, 25b\n" "26:" // Height 1: Multiply loop: No odd multiplies @@ -376,21 +375,21 @@ void a64_hybrid_fp32_mla_4x24 ( "prfm pstl1keep, [x27, #0x0]\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmin v12.4s, v12.4s, v17.4s\n" + "fmin v13.4s, v13.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" "27:" // Height 1: No activation "cmp x9, #0x18\n" "bge 40f\n" @@ -651,12 +650,12 @@ void a64_hybrid_fp32_mla_4x24 ( "60:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -664,7 +663,7 @@ void a64_hybrid_fp32_mla_4x24 ( "b 62f\n" "61:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "62:" // Height 2: input setup done "cmp x25, #0x4\n" "blt 65f\n" @@ -679,186 +678,186 @@ void a64_hybrid_fp32_mla_4x24 ( "63:" // Height 2: Multiply loop: Main loop head "fmla v8.4s, v4.4s, v0.s[0]\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q23, [x28, #0x40]\n" "sub x25, x25, #0x4\n" "fmla v9.4s, v5.4s, v0.s[0]\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q22, [x28, #0x50]\n" "add x24, x24, #0x10\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q21, [x28, #0x60]\n" "add x23, x23, #0x10\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x28, #0x70]\n" + "ldr q20, [x28, #0x70]\n" "cmp x25, #0x8\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x28, #0x80]\n" + "fmla v12.4s, v23.4s, v0.s[0]\n" + "fmla v18.4s, v23.4s, v1.s[0]\n" + "ldr q23, [x28, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x28, #0x90]\n" + "fmla v13.4s, v22.4s, v0.s[0]\n" + "fmla v19.4s, v22.4s, v1.s[0]\n" + "ldr q22, [x28, #0x90]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, 
v1.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x28, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x28, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x28, #0xb0]\n" + "fmla v10.4s, v23.4s, v0.s[1]\n" + "fmla v16.4s, v23.4s, v1.s[1]\n" + "ldr q23, [x28, #0xc0]\n" + "fmla v11.4s, v22.4s, v0.s[1]\n" + "fmla v17.4s, v22.4s, v1.s[1]\n" + "ldr q22, [x28, #0xd0]\n" + "fmla v12.4s, v21.4s, v0.s[1]\n" + "fmla v18.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x28, #0xe0]\n" + "fmla v13.4s, v20.4s, v0.s[1]\n" + "fmla v19.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x28, #0xf0]\n" + "fmla v8.4s, v23.4s, v0.s[2]\n" + "fmla v14.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x28, #0x100]\n" + "fmla v9.4s, v22.4s, v0.s[2]\n" + "fmla v15.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x28, #0x110]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v16.4s, v21.4s, v1.s[2]\n" + "ldr q21, [x28, #0x120]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v17.4s, v20.4s, v1.s[2]\n" + "ldr q20, [x28, #0x130]\n" + "fmla v12.4s, v23.4s, v0.s[2]\n" + "fmla v18.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x28, #0x140]\n" + "fmla v13.4s, v22.4s, v0.s[2]\n" + "fmla v19.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x28, #0x150]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "ldr q21, [x28, #0x160]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "ldr q20, [x28, #0x170]\n" "add x28, x28, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" + "fmla v10.4s, v23.4s, v0.s[3]\n" + "fmla v16.4s, v23.4s, v1.s[3]\n" "ldr q4, [x28, #0x0]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" + "fmla v11.4s, v22.4s, v0.s[3]\n" + "fmla v17.4s, v22.4s, v1.s[3]\n" "ldr q5, [x28, #0x10]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[3]\n" + "fmla v18.4s, v21.4s, v1.s[3]\n" "ldr q6, [x28, #0x20]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v20.4s, v0.s[3]\n" "ldr q0, [x24, #0x0]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v20.4s, v1.s[3]\n" "ldr q1, [x23, #0x0]\n" "ldr q7, [x28, #0x30]\n" "bge 63b\n" "64:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v4.4s, v0.s[0]\n" "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q23, [x28, #0x40]\n" "add x24, x24, #0x10\n" "fmla v9.4s, v5.4s, v0.s[0]\n" "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q22, [x28, #0x50]\n" "add x23, x23, #0x10\n" "fmla v10.4s, 
v6.4s, v0.s[0]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q21, [x28, #0x60]\n" "sub x25, x25, #0x4\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x28, #0x70]\n" + "ldr q20, [x28, #0x70]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "ldr q4, [x28, #0x80]\n" + "fmla v12.4s, v23.4s, v0.s[0]\n" + "fmla v18.4s, v23.4s, v1.s[0]\n" + "ldr q23, [x28, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "ldr q5, [x28, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x28, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "fmla v13.4s, v22.4s, v0.s[0]\n" + "fmla v19.4s, v22.4s, v1.s[0]\n" + "ldr q22, [x28, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x28, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x28, #0xb0]\n" + "fmla v10.4s, v23.4s, v0.s[1]\n" + "fmla v16.4s, v23.4s, v1.s[1]\n" + "ldr q23, [x28, #0xc0]\n" + "fmla v11.4s, v22.4s, v0.s[1]\n" + "fmla v17.4s, v22.4s, v1.s[1]\n" + "ldr q22, [x28, #0xd0]\n" + "fmla v12.4s, v21.4s, v0.s[1]\n" + "fmla v18.4s, v21.4s, v1.s[1]\n" + "ldr q21, [x28, #0xe0]\n" + "fmla v13.4s, v20.4s, v0.s[1]\n" + "fmla v19.4s, v20.4s, v1.s[1]\n" + "ldr q20, [x28, #0xf0]\n" + "fmla v8.4s, v23.4s, v0.s[2]\n" + "fmla v14.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x28, #0x100]\n" + "fmla v9.4s, v22.4s, v0.s[2]\n" + "fmla v15.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x28, #0x110]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v16.4s, v21.4s, v1.s[2]\n" + "ldr q21, [x28, #0x120]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v17.4s, v20.4s, v1.s[2]\n" + "ldr q20, [x28, #0x130]\n" + "fmla v12.4s, v23.4s, v0.s[2]\n" + "fmla v18.4s, v23.4s, v1.s[2]\n" + "ldr q23, [x28, #0x140]\n" + "fmla v13.4s, v22.4s, v0.s[2]\n" + "fmla v19.4s, v22.4s, v1.s[2]\n" + "ldr q22, [x28, #0x150]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "ldr q21, [x28, #0x160]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "ldr q20, [x28, #0x170]\n" "add x28, x28, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, 
v1.s[3]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v10.4s, v23.4s, v0.s[3]\n" + "fmla v16.4s, v23.4s, v1.s[3]\n" + "fmla v11.4s, v22.4s, v0.s[3]\n" + "fmla v17.4s, v22.4s, v1.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[3]\n" + "fmla v18.4s, v21.4s, v1.s[3]\n" + "fmla v13.4s, v20.4s, v0.s[3]\n" + "fmla v19.4s, v20.4s, v1.s[3]\n" "65:" // Height 2: Multiply loop: Main loop skip "cbz x25, 67f\n" "66:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x24], #0x4\n" - "ldr s1, [x23], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" "sub x25, x25, #0x1\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "ldr q4, [x28, #0x40]\n" - "ldr q5, [x28, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" + "ldr q21, [x28, #0x0]\n" + "ldr q20, [x28, #0x10]\n" + "fmla v8.4s, v21.4s, v25.s[0]\n" + "fmla v14.4s, v21.4s, v24.s[0]\n" + "ldr q23, [x28, #0x20]\n" + "ldr q22, [x28, #0x30]\n" + "fmla v9.4s, v20.4s, v25.s[0]\n" + "fmla v15.4s, v20.4s, v24.s[0]\n" + "ldr q21, [x28, #0x40]\n" + "ldr q20, [x28, #0x50]\n" + "fmla v10.4s, v23.4s, v25.s[0]\n" + "fmla v16.4s, v23.4s, v24.s[0]\n" + "fmla v11.4s, v22.4s, v25.s[0]\n" + "fmla v17.4s, v22.4s, v24.s[0]\n" "add x28, x28, #0x60\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" + "fmla v12.4s, v21.4s, v25.s[0]\n" + "fmla v18.4s, v21.4s, v24.s[0]\n" + "fmla v13.4s, v20.4s, v25.s[0]\n" + "fmla v19.4s, v20.4s, v24.s[0]\n" "cbnz x25, 66b\n" "67:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -871,33 +870,33 @@ void a64_hybrid_fp32_mla_4x24 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 68f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v21.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v21.4s\n" + "fmin v9.4s, v9.4s, v21.4s\n" + "fmin v10.4s, v10.4s, v21.4s\n" + "fmin v11.4s, v11.4s, v21.4s\n" + "fmin v12.4s, v12.4s, v21.4s\n" + "fmin v13.4s, v13.4s, v21.4s\n" + "fmin v14.4s, v14.4s, v21.4s\n" + "fmin v15.4s, v15.4s, v21.4s\n" + "fmin v16.4s, v16.4s, v21.4s\n" + "fmin v17.4s, v17.4s, v21.4s\n" + "fmin v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax 
v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "68:" // Height 2: No activation "cmp x9, #0x18\n" "bge 81f\n" @@ -1237,13 +1236,13 @@ void a64_hybrid_fp32_mla_4x24 ( "101:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 102f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 103f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1252,8 +1251,8 @@ void a64_hybrid_fp32_mla_4x24 ( "b 103f\n" "102:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "103:" // Height 3: input setup done "cmp x25, #0x4\n" "blt 106f\n" @@ -1272,107 +1271,107 @@ void a64_hybrid_fp32_mla_4x24 ( "sub x25, x25, #0x4\n" "add x24, x24, #0x10\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q29, [x28, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" "add x23, x23, #0x10\n" "fmla v15.4s, v5.4s, v1.s[0]\n" "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q28, [x28, #0x50]\n" "add x22, x22, #0x10\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" "cmp x25, #0x8\n" "prfm pldl1keep, [x24, #0x80]\n" "fmla v22.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q27, [x28, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "prfm pldl1keep, [x23, #0x80]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" "fmla v23.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x28, #0x70]\n" + "ldr q26, [x28, #0x70]\n" "prfm pldl1keep, [x22, #0x80]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x28, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x28, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v20.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v21.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "fmla v22.4s, v4.4s, v2.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "fmla v23.4s, v5.4s, v2.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "fmla v24.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "fmla v25.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "fmla v20.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "fmla v21.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x28, 
#0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "fmla v22.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "fmla v23.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "fmla v24.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "fmla v25.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v20.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v21.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "fmla v12.4s, v29.4s, v0.s[0]\n" + "fmla v18.4s, v29.4s, v1.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[0]\n" + "ldr q29, [x28, #0x80]\n" + "fmla v13.4s, v28.4s, v0.s[0]\n" + "fmla v19.4s, v28.4s, v1.s[0]\n" + "fmla v25.4s, v28.4s, v2.s[0]\n" + "ldr q28, [x28, #0x90]\n" + "fmla v8.4s, v27.4s, v0.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[1]\n" + "fmla v20.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x28, #0xa0]\n" + "fmla v9.4s, v26.4s, v0.s[1]\n" + "fmla v15.4s, v26.4s, v1.s[1]\n" + "fmla v21.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x28, #0xb0]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v16.4s, v29.4s, v1.s[1]\n" + "fmla v22.4s, v29.4s, v2.s[1]\n" + "ldr q29, [x28, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v17.4s, v28.4s, v1.s[1]\n" + "fmla v23.4s, v28.4s, v2.s[1]\n" + "ldr q28, [x28, #0xd0]\n" + "fmla v12.4s, v27.4s, v0.s[1]\n" + "fmla v18.4s, v27.4s, v1.s[1]\n" + "fmla v24.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x28, #0xe0]\n" + "fmla v13.4s, v26.4s, v0.s[1]\n" + "fmla v19.4s, v26.4s, v1.s[1]\n" + "fmla v25.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x28, #0xf0]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v20.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x28, #0x100]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v21.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x28, #0x110]\n" + "fmla v10.4s, v27.4s, v0.s[2]\n" + "fmla v16.4s, v27.4s, v1.s[2]\n" + "fmla v22.4s, v27.4s, v2.s[2]\n" + "ldr q27, [x28, #0x120]\n" + "fmla v11.4s, v26.4s, v0.s[2]\n" + "fmla v17.4s, v26.4s, v1.s[2]\n" + "fmla v23.4s, v26.4s, v2.s[2]\n" + "ldr q26, [x28, #0x130]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v18.4s, v29.4s, v1.s[2]\n" + "fmla v24.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x28, #0x140]\n" + "fmla v13.4s, v28.4s, v0.s[2]\n" + "fmla v19.4s, v28.4s, v1.s[2]\n" + "fmla v25.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x28, #0x150]\n" + "fmla v8.4s, v27.4s, v0.s[3]\n" + "fmla v14.4s, v27.4s, v1.s[3]\n" + "fmla v20.4s, v27.4s, v2.s[3]\n" + "ldr q27, [x28, #0x160]\n" + "fmla v9.4s, v26.4s, v0.s[3]\n" + "fmla v15.4s, v26.4s, v1.s[3]\n" + "fmla v21.4s, v26.4s, v2.s[3]\n" + "ldr q26, [x28, #0x170]\n" "add x28, x28, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "fmla v22.4s, v4.4s, v2.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v16.4s, v29.4s, v1.s[3]\n" + "fmla v22.4s, v29.4s, v2.s[3]\n" "ldr q4, [x28, #0x0]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" - "fmla v23.4s, v5.4s, v2.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v17.4s, v28.4s, v1.s[3]\n" + "fmla v23.4s, v28.4s, v2.s[3]\n" "ldr q5, [x28, #0x10]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "fmla v24.4s, v6.4s, v2.s[3]\n" + "fmla v12.4s, v27.4s, v0.s[3]\n" + "fmla 
v18.4s, v27.4s, v1.s[3]\n" + "fmla v24.4s, v27.4s, v2.s[3]\n" "ldr q6, [x28, #0x20]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v26.4s, v0.s[3]\n" "ldr q0, [x24, #0x0]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v26.4s, v1.s[3]\n" "ldr q1, [x23, #0x0]\n" - "fmla v25.4s, v7.4s, v2.s[3]\n" + "fmla v25.4s, v26.4s, v2.s[3]\n" "ldr q2, [x22, #0x0]\n" "ldr q7, [x28, #0x30]\n" "bge 104b\n" @@ -1382,133 +1381,133 @@ void a64_hybrid_fp32_mla_4x24 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "fmla v20.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x28, #0x40]\n" + "ldr q29, [x28, #0x40]\n" "fmla v9.4s, v5.4s, v0.s[0]\n" "add x22, x22, #0x10\n" "fmla v15.4s, v5.4s, v1.s[0]\n" "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr q28, [x28, #0x50]\n" "sub x25, x25, #0x4\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v16.4s, v6.4s, v1.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" "fmla v22.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x28, #0x60]\n" + "ldr q27, [x28, #0x60]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" "prfm pldl1keep, [x22, #0x80]\n" "fmla v17.4s, v7.4s, v1.s[0]\n" "fmla v23.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x28, #0x70]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "ldr q4, [x28, #0x80]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x28, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v20.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x28, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v21.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x28, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v1.s[1]\n" - "fmla v22.4s, v4.4s, v2.s[1]\n" - "ldr q4, [x28, #0xc0]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v17.4s, v5.4s, v1.s[1]\n" - "fmla v23.4s, v5.4s, v2.s[1]\n" - "ldr q5, [x28, #0xd0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v18.4s, v6.4s, v1.s[1]\n" - "fmla v24.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x28, #0xe0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v19.4s, v7.4s, v1.s[1]\n" - "fmla v25.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x28, #0xf0]\n" - "fmla v8.4s, v4.4s, v0.s[2]\n" - "fmla v14.4s, v4.4s, v1.s[2]\n" - "fmla v20.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x28, #0x100]\n" - "fmla v9.4s, v5.4s, v0.s[2]\n" - "fmla v15.4s, v5.4s, v1.s[2]\n" - "fmla v21.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x28, #0x110]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v16.4s, v6.4s, v1.s[2]\n" - "fmla v22.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x28, #0x120]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v1.s[2]\n" - "fmla v23.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x28, #0x130]\n" - "fmla v12.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v1.s[2]\n" - "fmla v24.4s, v4.4s, v2.s[2]\n" - "ldr q4, [x28, #0x140]\n" - "fmla v13.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v1.s[2]\n" - "fmla v25.4s, v5.4s, v2.s[2]\n" - "ldr q5, [x28, #0x150]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v20.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x28, #0x160]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v21.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x28, #0x170]\n" + "ldr q26, [x28, #0x70]\n" + "fmla v12.4s, v29.4s, v0.s[0]\n" + "fmla v18.4s, v29.4s, v1.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[0]\n" + "ldr q29, [x28, #0x80]\n" + "fmla v13.4s, v28.4s, v0.s[0]\n" + "fmla v19.4s, v28.4s, v1.s[0]\n" + "fmla v25.4s, v28.4s, v2.s[0]\n" + "ldr q28, [x28, #0x90]\n" + "fmla v8.4s, v27.4s, 
v0.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[1]\n" + "fmla v20.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x28, #0xa0]\n" + "fmla v9.4s, v26.4s, v0.s[1]\n" + "fmla v15.4s, v26.4s, v1.s[1]\n" + "fmla v21.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x28, #0xb0]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v16.4s, v29.4s, v1.s[1]\n" + "fmla v22.4s, v29.4s, v2.s[1]\n" + "ldr q29, [x28, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v17.4s, v28.4s, v1.s[1]\n" + "fmla v23.4s, v28.4s, v2.s[1]\n" + "ldr q28, [x28, #0xd0]\n" + "fmla v12.4s, v27.4s, v0.s[1]\n" + "fmla v18.4s, v27.4s, v1.s[1]\n" + "fmla v24.4s, v27.4s, v2.s[1]\n" + "ldr q27, [x28, #0xe0]\n" + "fmla v13.4s, v26.4s, v0.s[1]\n" + "fmla v19.4s, v26.4s, v1.s[1]\n" + "fmla v25.4s, v26.4s, v2.s[1]\n" + "ldr q26, [x28, #0xf0]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v20.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x28, #0x100]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v21.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x28, #0x110]\n" + "fmla v10.4s, v27.4s, v0.s[2]\n" + "fmla v16.4s, v27.4s, v1.s[2]\n" + "fmla v22.4s, v27.4s, v2.s[2]\n" + "ldr q27, [x28, #0x120]\n" + "fmla v11.4s, v26.4s, v0.s[2]\n" + "fmla v17.4s, v26.4s, v1.s[2]\n" + "fmla v23.4s, v26.4s, v2.s[2]\n" + "ldr q26, [x28, #0x130]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v18.4s, v29.4s, v1.s[2]\n" + "fmla v24.4s, v29.4s, v2.s[2]\n" + "ldr q29, [x28, #0x140]\n" + "fmla v13.4s, v28.4s, v0.s[2]\n" + "fmla v19.4s, v28.4s, v1.s[2]\n" + "fmla v25.4s, v28.4s, v2.s[2]\n" + "ldr q28, [x28, #0x150]\n" + "fmla v8.4s, v27.4s, v0.s[3]\n" + "fmla v14.4s, v27.4s, v1.s[3]\n" + "fmla v20.4s, v27.4s, v2.s[3]\n" + "ldr q27, [x28, #0x160]\n" + "fmla v9.4s, v26.4s, v0.s[3]\n" + "fmla v15.4s, v26.4s, v1.s[3]\n" + "fmla v21.4s, v26.4s, v2.s[3]\n" + "ldr q26, [x28, #0x170]\n" "add x28, x28, #0x180\n" - "fmla v10.4s, v4.4s, v0.s[3]\n" - "fmla v16.4s, v4.4s, v1.s[3]\n" - "fmla v22.4s, v4.4s, v2.s[3]\n" - "fmla v11.4s, v5.4s, v0.s[3]\n" - "fmla v17.4s, v5.4s, v1.s[3]\n" - "fmla v23.4s, v5.4s, v2.s[3]\n" - "fmla v12.4s, v6.4s, v0.s[3]\n" - "fmla v18.4s, v6.4s, v1.s[3]\n" - "fmla v24.4s, v6.4s, v2.s[3]\n" - "fmla v13.4s, v7.4s, v0.s[3]\n" - "fmla v19.4s, v7.4s, v1.s[3]\n" - "fmla v25.4s, v7.4s, v2.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v16.4s, v29.4s, v1.s[3]\n" + "fmla v22.4s, v29.4s, v2.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v17.4s, v28.4s, v1.s[3]\n" + "fmla v23.4s, v28.4s, v2.s[3]\n" + "fmla v12.4s, v27.4s, v0.s[3]\n" + "fmla v18.4s, v27.4s, v1.s[3]\n" + "fmla v24.4s, v27.4s, v2.s[3]\n" + "fmla v13.4s, v26.4s, v0.s[3]\n" + "fmla v19.4s, v26.4s, v1.s[3]\n" + "fmla v25.4s, v26.4s, v2.s[3]\n" "106:" // Height 3: Multiply loop: Main loop skip "cbz x25, 108f\n" "107:" // Height 3: Multiply loop: Odd block loop "ldr s0, [x24], #0x4\n" - "ldr s1, [x23], #0x4\n" + "ldr s31, [x23], #0x4\n" "sub x25, x25, #0x1\n" - "ldr s2, [x22], #0x4\n" - "ldr q4, [x28, #0x0]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q5, [x28, #0x10]\n" - "ldr q6, [x28, #0x20]\n" - "fmla v20.4s, v4.4s, v2.s[0]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "ldr q7, [x28, #0x30]\n" - "ldr q4, [x28, #0x40]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "fmla v21.4s, v5.4s, v2.s[0]\n" - "ldr q5, [x28, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" + "ldr s30, [x22], #0x4\n" + "ldr q27, [x28, #0x0]\n" + "fmla v8.4s, v27.4s, v0.s[0]\n" + "fmla v14.4s, v27.4s, v31.s[0]\n" + "ldr q26, [x28, #0x10]\n" + "ldr q29, [x28, #0x20]\n" + 
"fmla v20.4s, v27.4s, v30.s[0]\n" + "fmla v9.4s, v26.4s, v0.s[0]\n" + "ldr q28, [x28, #0x30]\n" + "ldr q27, [x28, #0x40]\n" + "fmla v15.4s, v26.4s, v31.s[0]\n" + "fmla v21.4s, v26.4s, v30.s[0]\n" + "ldr q26, [x28, #0x50]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v16.4s, v29.4s, v31.s[0]\n" "add x28, x28, #0x60\n" - "fmla v22.4s, v6.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" - "fmla v23.4s, v7.4s, v2.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v22.4s, v29.4s, v30.s[0]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v17.4s, v28.4s, v31.s[0]\n" + "fmla v23.4s, v28.4s, v30.s[0]\n" + "fmla v12.4s, v27.4s, v0.s[0]\n" + "fmla v18.4s, v27.4s, v31.s[0]\n" + "fmla v24.4s, v27.4s, v30.s[0]\n" + "fmla v13.4s, v26.4s, v0.s[0]\n" + "fmla v19.4s, v26.4s, v31.s[0]\n" + "fmla v25.4s, v26.4s, v30.s[0]\n" "cbnz x25, 107b\n" "108:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1523,45 +1522,45 @@ void a64_hybrid_fp32_mla_4x24 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 109f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v27.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" + "ld1r { v26.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v27.4s\n" + "fmin v9.4s, v9.4s, v27.4s\n" + "fmin v10.4s, v10.4s, v27.4s\n" + "fmin v11.4s, v11.4s, v27.4s\n" + "fmin v12.4s, v12.4s, v27.4s\n" + "fmin v13.4s, v13.4s, v27.4s\n" + "fmin v14.4s, v14.4s, v27.4s\n" + "fmin v15.4s, v15.4s, v27.4s\n" + "fmin v16.4s, v16.4s, v27.4s\n" + "fmin v17.4s, v17.4s, v27.4s\n" + "fmin v18.4s, v18.4s, v27.4s\n" + "fmin v19.4s, v19.4s, v27.4s\n" + "fmin v20.4s, v20.4s, v27.4s\n" + "fmin v21.4s, v21.4s, v27.4s\n" + "fmin v22.4s, v22.4s, v27.4s\n" + "fmin v23.4s, v23.4s, v27.4s\n" + "fmin v24.4s, v24.4s, v27.4s\n" + "fmin v25.4s, v25.4s, v27.4s\n" + "fmax v8.4s, v8.4s, v26.4s\n" + "fmax v9.4s, v9.4s, v26.4s\n" + "fmax v10.4s, v10.4s, v26.4s\n" + "fmax v11.4s, v11.4s, v26.4s\n" + "fmax v12.4s, v12.4s, v26.4s\n" + "fmax v13.4s, v13.4s, v26.4s\n" + "fmax v14.4s, v14.4s, v26.4s\n" + "fmax v15.4s, v15.4s, v26.4s\n" + "fmax v16.4s, v16.4s, v26.4s\n" + "fmax 
v17.4s, v17.4s, v26.4s\n" + "fmax v18.4s, v18.4s, v26.4s\n" + "fmax v19.4s, v19.4s, v26.4s\n" + "fmax v20.4s, v20.4s, v26.4s\n" + "fmax v21.4s, v21.4s, v26.4s\n" + "fmax v22.4s, v22.4s, v26.4s\n" + "fmax v23.4s, v23.4s, v26.4s\n" + "fmax v24.4s, v24.4s, v26.4s\n" + "fmax v25.4s, v25.4s, v26.4s\n" "109:" // Height 3: No activation "cmp x9, #0x18\n" "bge 122f\n" @@ -1983,14 +1982,14 @@ void a64_hybrid_fp32_mla_4x24 ( "142:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 143f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 144f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -2000,9 +1999,9 @@ void a64_hybrid_fp32_mla_4x24 ( "b 144f\n" "143:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "144:" // Height 4: input setup done "cmp x25, #0x4\n" "blt 147f\n" @@ -2283,42 +2282,42 @@ void a64_hybrid_fp32_mla_4x24 ( "147:" // Height 4: Multiply loop: Main loop skip "cbz x25, 149f\n" "148:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x24], #0x4\n" - "ldr s1, [x23], #0x4\n" + "ldr s7, [x24], #0x4\n" + "ldr s6, [x23], #0x4\n" "sub x25, x25, #0x1\n" - "ldr s2, [x22], #0x4\n" - "ldr s3, [x21], #0x4\n" - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "fmla v14.4s, v4.4s, v1.s[0]\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - "fmla v20.4s, v4.4s, v2.s[0]\n" - "fmla v26.4s, v4.4s, v3.s[0]\n" - "ldr q4, [x28, #0x40]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v15.4s, v5.4s, v1.s[0]\n" - "fmla v21.4s, v5.4s, v2.s[0]\n" - "fmla v27.4s, v5.4s, v3.s[0]\n" - "ldr q5, [x28, #0x50]\n" + "ldr s5, [x22], #0x4\n" + "ldr s4, [x21], #0x4\n" + "ldr q1, [x28, #0x0]\n" + "ldr q0, [x28, #0x10]\n" + "fmla v8.4s, v1.4s, v7.s[0]\n" + "fmla v14.4s, v1.4s, v6.s[0]\n" + "ldr q3, [x28, #0x20]\n" + "ldr q2, [x28, #0x30]\n" + "fmla v20.4s, v1.4s, v5.s[0]\n" + "fmla v26.4s, v1.4s, v4.s[0]\n" + "ldr q1, [x28, #0x40]\n" + "fmla v9.4s, v0.4s, v7.s[0]\n" + "fmla v15.4s, v0.4s, v6.s[0]\n" + "fmla v21.4s, v0.4s, v5.s[0]\n" + "fmla v27.4s, v0.4s, v4.s[0]\n" + "ldr q0, [x28, #0x50]\n" "add x28, x28, #0x60\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v16.4s, v6.4s, v1.s[0]\n" - "fmla v22.4s, v6.4s, v2.s[0]\n" - "fmla v28.4s, v6.4s, v3.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v17.4s, v7.4s, v1.s[0]\n" - "fmla v23.4s, v7.4s, v2.s[0]\n" - "fmla v29.4s, v7.4s, v3.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[0]\n" - "fmla v18.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[0]\n" - "fmla v30.4s, v4.4s, v3.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[0]\n" - "fmla v19.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v2.s[0]\n" - "fmla v31.4s, v5.4s, v3.s[0]\n" + "fmla v10.4s, v3.4s, v7.s[0]\n" + "fmla v16.4s, v3.4s, v6.s[0]\n" + "fmla v22.4s, v3.4s, v5.s[0]\n" + "fmla v28.4s, v3.4s, v4.s[0]\n" + "fmla v11.4s, v2.4s, 
v7.s[0]\n" + "fmla v17.4s, v2.4s, v6.s[0]\n" + "fmla v23.4s, v2.4s, v5.s[0]\n" + "fmla v29.4s, v2.4s, v4.s[0]\n" + "fmla v12.4s, v1.4s, v7.s[0]\n" + "fmla v18.4s, v1.4s, v6.s[0]\n" + "fmla v24.4s, v1.4s, v5.s[0]\n" + "fmla v30.4s, v1.4s, v4.s[0]\n" + "fmla v13.4s, v0.4s, v7.s[0]\n" + "fmla v19.4s, v0.4s, v6.s[0]\n" + "fmla v25.4s, v0.4s, v5.s[0]\n" + "fmla v31.4s, v0.4s, v4.s[0]\n" "cbnz x25, 148b\n" "149:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2584,7 +2583,6 @@ void a64_hybrid_fp32_mla_4x24 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "166:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp index 4cfa18bb84..759729de5e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -113,5 +113,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp index 985d57d9b6..ddbc840829 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16_a55 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 166f\n" @@ -189,11 +188,11 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" "cbnz x15, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" @@ -210,126 +209,126 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "blt 19f\n" "18:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr d6, [x17, #0x20]\n" - "ldr x12, [x17, #0x28]\n" + "ldr d17, [x17, #0x20]\n" + "ldr x20, [x17, #0x28]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x38]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr d6, [x17, #0x40]\n" - "ldr x12, [x17, #0x48]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x58]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr d6, [x17, #0x60]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x78]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr d6, [x17, #0x80]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x98]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr d6, [x17, #0xa0]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xb8]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr d6, [x17, #0xc0]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xd8]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr d6, [x17, #0xe0]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xf8]\n" - "mov v7.d[1], x11\n" + "ldr d16, [x17, #0x30]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x38]\n" + "mov v16.d[1], x20\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr d17, [x17, #0x40]\n" + "ldr x20, [x17, #0x48]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr d16, [x17, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x58]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr d17, [x17, #0x60]\n" + "ldr x20, [x17, #0x68]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr d16, [x17, #0x70]\n" + 
"mov v17.d[1], x20\n" + "ldr x20, [x17, #0x78]\n" + "mov v16.d[1], x20\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr d17, [x17, #0x80]\n" + "ldr x20, [x17, #0x88]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr d16, [x17, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x98]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr d17, [x17, #0xa0]\n" + "ldr x20, [x17, #0xa8]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr d16, [x17, #0xb0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xb8]\n" + "mov v16.d[1], x20\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr d17, [x17, #0xc0]\n" + "ldr x20, [x17, #0xc8]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr d16, [x17, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xd8]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr d17, [x17, #0xe0]\n" + "ldr x20, [x17, #0xe8]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr d16, [x17, #0xf0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xf8]\n" + "mov v16.d[1], x20\n" "add x13, x13, #0x10\n" "add x17, x17, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" "ldr d6, [x17, #0x0]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr x20, [x17, #0x8]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" "sub x14, x14, #0x4\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x8\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x18]\n" - "mov v0.d[1], x10\n" - "mov v7.d[1], x11\n" + "ldr x21, [x13, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x13, #0x80]\n" "bge 18b\n" "19:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q17, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x17, #0xf0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr q17, [x17, #0x40]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr q16, [x17, #0x50]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x17, #0x60]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x17, #0x70]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x17, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x17, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x17, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x17, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x17, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x17, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x17, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x17, #0xf0]\n" "add x13, x13, #0x10\n" "sub x14, x14, #0x4\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, 
v0.s[3]\n" "add x17, x17, #0x100\n" "20:" // Height 1: Multiply loop: Main loop skip "cbz x14, 22f\n" "21:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s17, [x13], #0x4\n" "sub x14, x14, #0x1\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q16, [x17, #0x0]\n" + "fmla v8.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x17, #0x10]\n" + "fmla v9.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x17, #0x20]\n" + "fmla v10.4s, v16.4s, v17.s[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v11.4s, v16.4s, v17.s[0]\n" "add x17, x17, #0x40\n" "cbnz x14, 21b\n" "22:" // Height 1: Multiply loop: No odd multiplies @@ -340,17 +339,17 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "prfm pstl1keep, [x16, #0x0]\n" "tbz %x[flags], #1, 23f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v16.4s\n" + "fmin v9.4s, v9.4s, v16.4s\n" + "fmin v10.4s, v10.4s, v16.4s\n" + "fmin v11.4s, v11.4s, v16.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" "23:" // Height 1: No activation "cmp x8, #0x10\n" "bge 32f\n" @@ -528,196 +527,196 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" "cbnz x15, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" - "add x9, x9, x20, LSL #2\n" + "add x12, x12, x20, LSL #2\n" "b 50f\n" "49:" // Height 2: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #2\n" + "add x12, x13, x21, LSL #2\n" "50:" // Height 2: input setup done "cmp x14, #0x4\n" "blt 53f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x8\n" - "ldr q1, [x9, #0x0]\n" + "ldr q1, [x12, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 52f\n" "51:" // Height 2: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d17, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v6.d[1], x12\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "mov v7.d[1], x11\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr x12, [x17, #0x48]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x58]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, 
v0.s[1]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v6.d[1], x12\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "mov v7.d[1], x11\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x98]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v6.d[1], x12\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "mov v7.d[1], x11\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0xd8]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v6.d[1], x12\n" + "ldr d16, [x17, #0x30]\n" + "mov v17.d[1], x21\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "mov v16.d[1], x20\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr d17, [x17, #0x40]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr x20, [x17, #0x48]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr d16, [x17, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x58]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr d17, [x17, #0x60]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr d16, [x17, #0x70]\n" + "mov v17.d[1], x21\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "mov v16.d[1], x20\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr d17, [x17, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr x20, [x17, #0x88]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr d16, [x17, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0x98]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr d17, [x17, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr d16, [x17, #0xb0]\n" + "mov v17.d[1], x21\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "mov v16.d[1], x20\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr d17, [x17, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr x20, [x17, #0xc8]\n" + "fmla v15.4s, v16.4s, v1.s[2]\n" + "ldr d16, [x17, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x17, #0xd8]\n" + "mov v16.d[1], x20\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr d17, [x17, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v13.4s, v16.4s, v1.s[3]\n" + "ldr d16, [x17, #0xf0]\n" + "mov v17.d[1], x21\n" "add x13, x13, #0x10\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" + "mov v16.d[1], x20\n" + "add x12, x12, #0x10\n" "add x17, x17, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" "ldr d6, [x17, 
#0x0]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" + "ldr d1, [x12, #0x0]\n" "sub x14, x14, #0x4\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x8\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x28, [x9, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x17, #0x18]\n" - "mov v1.d[1], x28\n" + "ldr x20, [x13, #0x8]\n" + "mov v6.d[1], x21\n" + "ldr x21, [x12, #0x8]\n" + "mov v0.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v1.d[1], x21\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v7.d[1], x11\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v7.d[1], x20\n" + "prfm pldl1keep, [x12, #0x80]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "add x13, x13, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q17, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" "sub x14, x14, #0x4\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr q17, [x17, #0x40]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr q16, [x17, #0x50]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x17, #0x60]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x17, #0x70]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x17, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x17, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x17, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x17, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x17, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "fmla v15.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x17, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr q17, [x17, #0xe0]\n" + "fmla v9.4s, 
v16.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v1.s[3]\n" + "ldr q16, [x17, #0xf0]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" "add x17, x17, #0x100\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x14, 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s19, [x13], #0x4\n" "sub x14, x14, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr s18, [x12], #0x4\n" + "ldr q17, [x17, #0x0]\n" + "fmla v8.4s, v17.4s, v19.s[0]\n" + "ldr q16, [x17, #0x10]\n" + "fmla v12.4s, v17.4s, v18.s[0]\n" + "ldr q17, [x17, #0x20]\n" + "fmla v9.4s, v16.4s, v19.s[0]\n" + "fmla v13.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x17, #0x30]\n" + "fmla v10.4s, v17.4s, v19.s[0]\n" "add x17, x17, #0x40\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v14.4s, v17.4s, v18.s[0]\n" + "fmla v11.4s, v16.4s, v19.s[0]\n" + "fmla v15.4s, v16.4s, v18.s[0]\n" "cbnz x14, 54b\n" "55:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -730,25 +729,25 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "prfm pstl1keep, [x25, #0x0]\n" "tbz %x[flags], #1, 56f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v16.4s\n" + "fmin v9.4s, v9.4s, v16.4s\n" + "fmin v10.4s, v10.4s, v16.4s\n" + "fmin v11.4s, v11.4s, v16.4s\n" + "fmin v12.4s, v12.4s, v16.4s\n" + "fmin v13.4s, v13.4s, v16.4s\n" + "fmin v14.4s, v14.4s, v16.4s\n" + "fmin v15.4s, v15.4s, v16.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" + "fmax v14.4s, v14.4s, v16.4s\n" + "fmax v15.4s, v15.4s, v16.4s\n" "56:" // Height 2: No activation "cmp x8, #0x10\n" "bge 65f\n" @@ -975,244 +974,244 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "81:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 82f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, 
#0x8]\n" + "ldr x11, [x20, #0x10]\n" "cbnz x15, 83f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" - "add x9, x9, x20, LSL #2\n" - "add x27, x27, x20, LSL #2\n" + "add x12, x12, x20, LSL #2\n" + "add x11, x11, x20, LSL #2\n" "b 83f\n" "82:" // Height 3: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #2\n" - "add x27, x9, x20, LSL #2\n" + "add x12, x13, x21, LSL #2\n" + "add x11, x12, x21, LSL #2\n" "83:" // Height 3: input setup done "cmp x14, #0x4\n" "blt 86f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x8\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 85f\n" "84:" // Height 3: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d21, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" + "mov v21.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x98]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr d20, [x17, #0x30]\n" + "mov v20.d[1], x20\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr d21, [x17, 
#0x40]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" + "mov v21.d[1], x21\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr d20, [x17, #0x50]\n" + "mov v20.d[1], x20\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr d21, [x17, #0x60]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "mov v21.d[1], x21\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr d20, [x17, #0x70]\n" + "mov v20.d[1], x20\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr d21, [x17, #0x80]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "mov v21.d[1], x21\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr d20, [x17, #0x90]\n" + "mov v20.d[1], x20\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr d21, [x17, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" + "mov v21.d[1], x21\n" + "fmla v13.4s, v20.4s, v1.s[2]\n" + "ldr x21, [x17, #0xc8]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr d20, [x17, #0xb0]\n" + "mov v20.d[1], x20\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr d21, [x17, #0xc0]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "mov v21.d[1], x21\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr d20, [x17, #0xd0]\n" + "mov v20.d[1], x20\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr d21, [x17, #0xe0]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "mov v21.d[1], x21\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" "add x13, x13, #0x10\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr d20, [x17, #0xf0]\n" + "mov v20.d[1], x20\n" + "add x12, x12, #0x10\n" + "add x11, x11, #0x10\n" "add x17, x17, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" + "ldr x20, [x17, #0x8]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "ldr x23, [x13, #0x8]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" "ldr d6, [x17, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "ldr d1, [x12, #0x0]\n" + "ldr x22, [x12, #0x8]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" + "ldr d2, [x11, #0x0]\n" "sub x14, x14, #0x4\n" "ldr d7, [x17, #0x10]\n" "cmp x14, #0x8\n" - "ldr x26, [x27, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x17, #0x18]\n" - "mov v0.d[1], x10\n" + "ldr x21, [x11, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x17, #0x18]\n" + "mov v0.d[1], x23\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v1.d[1], x28\n" - "prfm pldl1keep, [x9, #0x80]\n" - "mov v2.d[1], x26\n" - "prfm pldl1keep, [x27, #0x80]\n" - "mov v7.d[1], x11\n" + "mov v1.d[1], x22\n" + "prfm pldl1keep, [x12, #0x80]\n" + "mov v2.d[1], x21\n" + "prfm pldl1keep, [x11, #0x80]\n" + 
"mov v7.d[1], x20\n" "bge 84b\n" "85:" // Height 3: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "add x13, x13, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q21, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "sub x14, x14, #0x4\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q20, [x17, #0x30]\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr q21, [x17, #0x40]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr q20, [x17, #0x50]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x17, #0x60]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x17, #0x70]\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x17, #0x80]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x17, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x17, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" + "fmla v13.4s, v20.4s, v1.s[2]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x17, #0xb0]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x17, #0xc0]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + 
"fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x17, #0xd0]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr q21, [x17, #0xe0]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr q20, [x17, #0xf0]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" "add x17, x17, #0x100\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" "86:" // Height 3: Multiply loop: Main loop skip - "cbz x14, 88f\n" - "87:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" - "sub x14, x14, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "add x17, x17, #0x40\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" + "cbz x14, 88f\n" + "87:" // Height 3: Multiply loop: Odd block loop + "ldr s24, [x13], #0x4\n" + "sub x14, x14, #0x1\n" + "ldr s23, [x12], #0x4\n" + "ldr s22, [x11], #0x4\n" + "ldr q21, [x17, #0x0]\n" + "fmla v8.4s, v21.4s, v24.s[0]\n" + "ldr q20, [x17, #0x10]\n" + "fmla v12.4s, v21.4s, v23.s[0]\n" + "fmla v16.4s, v21.4s, v22.s[0]\n" + "ldr q21, [x17, #0x20]\n" + "fmla v9.4s, v20.4s, v24.s[0]\n" + "fmla v13.4s, v20.4s, v23.s[0]\n" + "fmla v17.4s, v20.4s, v22.s[0]\n" + "ldr q20, [x17, #0x30]\n" + "fmla v10.4s, v21.4s, v24.s[0]\n" + "add x17, x17, #0x40\n" + "fmla v14.4s, v21.4s, v23.s[0]\n" + "fmla v18.4s, v21.4s, v22.s[0]\n" + "fmla v11.4s, v20.4s, v24.s[0]\n" + "fmla v15.4s, v20.4s, v23.s[0]\n" + "fmla v19.4s, v20.4s, v22.s[0]\n" "cbnz x14, 87b\n" "88:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1227,33 +1226,33 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "prfm pstl1keep, [x24, #0x0]\n" "tbz %x[flags], #1, 89f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" - "fmin v16.4s, v16.4s, v0.4s\n" - "fmin v17.4s, v17.4s, v0.4s\n" - "fmin v18.4s, v18.4s, v0.4s\n" - "fmin v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v20.4s\n" + "fmin v9.4s, v9.4s, v20.4s\n" + "fmin v10.4s, v10.4s, v20.4s\n" + "fmin v11.4s, v11.4s, v20.4s\n" + "fmin v12.4s, v12.4s, v20.4s\n" + "fmin v13.4s, v13.4s, v20.4s\n" + "fmin v14.4s, v14.4s, v20.4s\n" + "fmin v15.4s, v15.4s, v20.4s\n" + "fmin v16.4s, v16.4s, v20.4s\n" + "fmin v17.4s, v17.4s, v20.4s\n" + "fmin v18.4s, v18.4s, v20.4s\n" + "fmin v19.4s, v19.4s, v20.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, 
v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "89:" // Height 3: No activation "cmp x8, #0x10\n" "bge 98f\n" @@ -1529,292 +1528,292 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "114:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 115f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" "cbnz x15, 116f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" - "add x9, x9, x20, LSL #2\n" - "add x27, x27, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" + "add x12, x12, x20, LSL #2\n" + "add x11, x11, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" "b 116f\n" "115:" // Height 4: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #2\n" - "add x27, x9, x20, LSL #2\n" - "add x25, x27, x20, LSL #2\n" + "add x12, x13, x21, LSL #2\n" + "add x11, x12, x21, LSL #2\n" + "add x10, x11, x21, LSL #2\n" "116:" // Height 4: input setup done "cmp x14, #0x4\n" "blt 119f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x8\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 118f\n" "117:" // Height 4: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "add x13, x13, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d25, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" + "mov v25.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "add x27, x27, #0x10\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "add x25, x25, #0x10\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla 
v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x98]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr x26, [x27, #0x8]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr x24, [x25, #0x8]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr d24, [x17, #0x30]\n" + "mov v24.d[1], x20\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "add x11, x11, #0x10\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr d25, [x17, #0x40]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "mov v25.d[1], x21\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "add x10, x10, #0x10\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr d24, [x17, #0x50]\n" + "mov v24.d[1], x20\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "ldr x25, [x13, #0x8]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr d25, [x17, #0x60]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "mov v25.d[1], x21\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "ldr x24, [x12, #0x8]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr d24, [x17, #0x70]\n" + "mov v24.d[1], x20\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "ldr x23, [x11, #0x8]\n" + "fmla v22.4s, v25.4s, v3.s[1]\n" + "ldr d25, [x17, #0x80]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "mov v25.d[1], x21\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "ldr x22, [x10, #0x8]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr d24, [x17, #0x90]\n" + "mov v24.d[1], x20\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" "sub x14, x14, #0x4\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr d25, [x17, #0xa0]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "mov v25.d[1], x21\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "ldr x21, [x17, #0xc8]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" "cmp x14, #0x8\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, v3.s[2]\n" + "ldr d24, [x17, #0xb0]\n" + "mov v24.d[1], x20\n" + "fmla v10.4s, v25.4s, v0.s[2]\n" + 
"fmla v14.4s, v25.4s, v1.s[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr d25, [x17, #0xc0]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "mov v25.d[1], x21\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr d24, [x17, #0xd0]\n" + "mov v24.d[1], x20\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr d25, [x17, #0xe0]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "mov v25.d[1], x21\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v17.4s, v24.4s, v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr d24, [x17, #0xf0]\n" + "mov v24.d[1], x20\n" "add x17, x17, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0x18]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "ldr x20, [x17, #0x18]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" "ldr d6, [x17, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" - "ldr d3, [x25, #0x0]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" + "ldr d1, [x12, #0x0]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" + "ldr d2, [x11, #0x0]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" + "ldr d3, [x10, #0x0]\n" "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v7.d[1], x11\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x24\n" + "mov v2.d[1], x23\n" + "mov v3.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 117b\n" "118:" // Height 4: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "add x13, x13, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q25, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "sub x14, x14, #0x4\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x13, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" 
- "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "ldr q24, [x17, #0x30]\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr q25, [x17, #0x40]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr q24, [x17, #0x50]\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x17, #0x60]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x17, #0x70]\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "fmla v22.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x17, #0x80]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x17, #0x90]\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x17, #0xa0]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, 
v3.s[2]\n" + "ldr q24, [x17, #0xb0]\n" + "fmla v10.4s, v25.4s, v0.s[2]\n" + "fmla v14.4s, v25.4s, v1.s[2]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x17, #0xc0]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x17, #0xd0]\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr q25, [x17, #0xe0]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "fmla v17.4s, v24.4s, v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr q24, [x17, #0xf0]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" "add x17, x17, #0x100\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" "119:" // Height 4: Multiply loop: Main loop skip "cbz x14, 121f\n" "120:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s29, [x13], #0x4\n" "sub x14, x14, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr s28, [x12], #0x4\n" + "ldr s27, [x11], #0x4\n" + "ldr s26, [x10], #0x4\n" + "ldr q25, [x17, #0x0]\n" + "fmla v8.4s, v25.4s, v29.s[0]\n" + "ldr q24, [x17, #0x10]\n" + "fmla v12.4s, v25.4s, v28.s[0]\n" + "fmla v16.4s, v25.4s, v27.s[0]\n" + "fmla v20.4s, v25.4s, v26.s[0]\n" + "ldr q25, [x17, #0x20]\n" + "fmla v9.4s, v24.4s, v29.s[0]\n" + "fmla v13.4s, v24.4s, v28.s[0]\n" + "fmla v17.4s, v24.4s, v27.s[0]\n" + "fmla v21.4s, v24.4s, v26.s[0]\n" + "ldr q24, [x17, #0x30]\n" + "fmla v10.4s, v25.4s, v29.s[0]\n" "add x17, x17, #0x40\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v14.4s, v25.4s, v28.s[0]\n" + "fmla v18.4s, v25.4s, v27.s[0]\n" + "fmla v22.4s, v25.4s, v26.s[0]\n" + "fmla v11.4s, v24.4s, v29.s[0]\n" + "fmla v15.4s, v24.4s, v28.s[0]\n" + "fmla v19.4s, v24.4s, v27.s[0]\n" + "fmla v23.4s, v24.4s, v26.s[0]\n" "cbnz x14, 120b\n" "121:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1831,41 +1830,41 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 122f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" - "fmin v16.4s, 
v16.4s, v0.4s\n" - "fmin v17.4s, v17.4s, v0.4s\n" - "fmin v18.4s, v18.4s, v0.4s\n" - "fmin v19.4s, v19.4s, v0.4s\n" - "fmin v20.4s, v20.4s, v0.4s\n" - "fmin v21.4s, v21.4s, v0.4s\n" - "fmin v22.4s, v22.4s, v0.4s\n" - "fmin v23.4s, v23.4s, v0.4s\n" + "ld1r { v24.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v24.4s\n" + "fmin v9.4s, v9.4s, v24.4s\n" + "fmin v10.4s, v10.4s, v24.4s\n" + "fmin v11.4s, v11.4s, v24.4s\n" + "fmin v12.4s, v12.4s, v24.4s\n" + "fmin v13.4s, v13.4s, v24.4s\n" + "fmin v14.4s, v14.4s, v24.4s\n" + "fmin v15.4s, v15.4s, v24.4s\n" + "fmin v16.4s, v16.4s, v24.4s\n" + "fmin v17.4s, v17.4s, v24.4s\n" + "fmin v18.4s, v18.4s, v24.4s\n" + "fmin v19.4s, v19.4s, v24.4s\n" + "fmin v20.4s, v20.4s, v24.4s\n" + "fmin v21.4s, v21.4s, v24.4s\n" + "fmin v22.4s, v22.4s, v24.4s\n" + "fmin v23.4s, v23.4s, v24.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" + "ld1r { v24.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v24.4s\n" + "fmax v9.4s, v9.4s, v24.4s\n" + "fmax v10.4s, v10.4s, v24.4s\n" + "fmax v11.4s, v11.4s, v24.4s\n" + "fmax v12.4s, v12.4s, v24.4s\n" + "fmax v13.4s, v13.4s, v24.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "fmax v23.4s, v23.4s, v24.4s\n" "122:" // Height 4: No activation "cmp x8, #0x10\n" "bge 131f\n" @@ -2190,340 +2189,340 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "147:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 148f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" "cbnz x15, 149f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" + "add x12, x12, x20, LSL #2\n" + "add x11, x11, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" "add x9, x9, x20, LSL #2\n" - "add x27, x27, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" "b 149f\n" "148:" // Height 5: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #2\n" - "add x27, x9, x20, LSL #2\n" - "add x25, x27, x20, LSL #2\n" - "add x23, x25, x20, LSL #2\n" + "add x12, x13, x21, LSL #2\n" + "add x11, x12, x21, LSL #2\n" + "add x10, x11, x21, LSL #2\n" + "add x9, x10, x21, LSL #2\n" "149:" // Height 5: input setup done "cmp x14, #0x4\n" "blt 152f\n" "ldr q0, [x13, #0x0]\n" "cmp 
x14, #0x8\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 151f\n" "150:" // Height 5: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "add x13, x13, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr d6, [x17, #0x20]\n" + "ldr d29, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" + "mov v29.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x58]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "add x23, x23, #0x10\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr x10, [x13, #0x8]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr d6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x68]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr x28, [x9, #0x8]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr x26, [x27, #0x8]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x78]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr x24, [x25, #0x8]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr x22, [x23, #0x8]\n" - "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr d6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0x88]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr d28, [x17, #0x30]\n" + "mov v28.d[1], x20\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "ldr x20, [x17, #0x58]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "add x9, x9, #0x10\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" + "ldr x26, [x13, #0x8]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr d29, [x17, #0x40]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "mov v29.d[1], x21\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "ldr x21, [x17, #0x68]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "ldr x25, [x12, #0x8]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "ldr x24, [x11, #0x8]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr d28, [x17, #0x50]\n" + "mov v28.d[1], x20\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "ldr x20, [x17, #0x78]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "ldr x23, [x10, #0x8]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "ldr x22, [x9, #0x8]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr d29, [x17, #0x60]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "mov v29.d[1], x21\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "ldr x21, [x17, #0x88]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" "sub x14, x14, #0x4\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" "cmp x14, #0x8\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr 
x11, [x17, #0x98]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr d28, [x17, #0x70]\n" + "mov v28.d[1], x20\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "ldr x20, [x17, #0x98]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" "prfm pldl1keep, [x13, #0x80]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr d29, [x17, #0x80]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "mov v29.d[1], x21\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "ldr x21, [x17, #0xa8]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr d28, [x17, #0x90]\n" + "mov v28.d[1], x20\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "ldr x20, [x17, #0xb8]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" "prfm pldl1keep, [x9, #0x80]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr d6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0xa8]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xb8]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr d6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xc8]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xd8]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr d6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xe8]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0xf8]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr d6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "mov v6.d[1], x12\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr d29, [x17, #0xa0]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "mov v29.d[1], x21\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "ldr x21, [x17, #0xc8]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + "fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr d28, [x17, #0xb0]\n" + "mov v28.d[1], x20\n" + "fmla v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "ldr x20, [x17, #0xd8]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr d29, [x17, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" + "mov v29.d[1], x21\n" + "fmla 
v15.4s, v28.4s, v1.s[2]\n" + "ldr x21, [x17, #0xe8]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr d28, [x17, #0xd0]\n" + "mov v28.d[1], x20\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "ldr x20, [x17, #0xf8]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr d29, [x17, #0xe0]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" + "mov v29.d[1], x21\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr d28, [x17, #0xf0]\n" + "mov v28.d[1], x20\n" "add x17, x17, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "ldr x12, [x17, #0x8]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0x18]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "ldr x21, [x17, #0x8]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "ldr x20, [x17, #0x18]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" "ldr d6, [x17, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" - "ldr d3, [x25, #0x0]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" - "ldr d4, [x23, #0x0]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" + "ldr d1, [x12, #0x0]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" + "ldr d2, [x11, #0x0]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" + "ldr d3, [x10, #0x0]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" + "ldr d4, [x9, #0x0]\n" "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x26\n" + "mov v1.d[1], x25\n" + "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" "mov v4.d[1], x22\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "bge 150b\n" "151:" // Height 5: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "add x13, x13, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x17, #0x20]\n" + "ldr q29, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "sub x14, x14, #0x4\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x13, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x17, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x17, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - 
"fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x17, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x17, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x17, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x17, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x17, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x17, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x17, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x17, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x17, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x17, #0xf0]\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" + "ldr q28, [x17, #0x30]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr q29, [x17, #0x40]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr q28, [x17, #0x50]\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x17, #0x60]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x17, #0x70]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x17, #0x80]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x17, #0x90]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x17, #0xa0]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + 
"fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x17, #0xb0]\n" + "fmla v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x17, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x17, #0xd0]\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr q29, [x17, #0xe0]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr q28, [x17, #0xf0]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" "add x17, x17, #0x100\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" "152:" // Height 5: Multiply loop: Main loop skip "cbz x14, 154f\n" "153:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s2, [x13], #0x4\n" "sub x14, x14, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr s1, [x12], #0x4\n" + "ldr s0, [x11], #0x4\n" + "ldr s31, [x10], #0x4\n" + "ldr s30, [x9], #0x4\n" + "ldr q29, [x17, #0x0]\n" + "fmla v8.4s, v29.4s, v2.s[0]\n" + "ldr q28, [x17, #0x10]\n" + "fmla v12.4s, v29.4s, v1.s[0]\n" + "fmla v16.4s, v29.4s, v0.s[0]\n" + "fmla v20.4s, v29.4s, v31.s[0]\n" + "fmla v24.4s, v29.4s, v30.s[0]\n" + "ldr q29, [x17, #0x20]\n" + "fmla v9.4s, v28.4s, v2.s[0]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v17.4s, v28.4s, v0.s[0]\n" + "fmla v21.4s, v28.4s, v31.s[0]\n" + "fmla v25.4s, v28.4s, v30.s[0]\n" + "ldr q28, [x17, #0x30]\n" + "fmla v10.4s, v29.4s, v2.s[0]\n" "add x17, x17, #0x40\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v18.4s, v29.4s, v0.s[0]\n" + "fmla v22.4s, v29.4s, v31.s[0]\n" + "fmla v26.4s, v29.4s, v30.s[0]\n" + "fmla v11.4s, v28.4s, v2.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v0.s[0]\n" + "fmla v23.4s, v28.4s, v31.s[0]\n" + "fmla 
v27.4s, v28.4s, v30.s[0]\n" "cbnz x14, 153b\n" "154:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2542,49 +2541,49 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 155f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v0.4s\n" - "fmin v9.4s, v9.4s, v0.4s\n" - "fmin v10.4s, v10.4s, v0.4s\n" - "fmin v11.4s, v11.4s, v0.4s\n" - "fmin v12.4s, v12.4s, v0.4s\n" - "fmin v13.4s, v13.4s, v0.4s\n" - "fmin v14.4s, v14.4s, v0.4s\n" - "fmin v15.4s, v15.4s, v0.4s\n" - "fmin v16.4s, v16.4s, v0.4s\n" - "fmin v17.4s, v17.4s, v0.4s\n" - "fmin v18.4s, v18.4s, v0.4s\n" - "fmin v19.4s, v19.4s, v0.4s\n" - "fmin v20.4s, v20.4s, v0.4s\n" - "fmin v21.4s, v21.4s, v0.4s\n" - "fmin v22.4s, v22.4s, v0.4s\n" - "fmin v23.4s, v23.4s, v0.4s\n" - "fmin v24.4s, v24.4s, v0.4s\n" - "fmin v25.4s, v25.4s, v0.4s\n" - "fmin v26.4s, v26.4s, v0.4s\n" - "fmin v27.4s, v27.4s, v0.4s\n" + "ld1r { v28.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v28.4s\n" + "fmin v9.4s, v9.4s, v28.4s\n" + "fmin v10.4s, v10.4s, v28.4s\n" + "fmin v11.4s, v11.4s, v28.4s\n" + "fmin v12.4s, v12.4s, v28.4s\n" + "fmin v13.4s, v13.4s, v28.4s\n" + "fmin v14.4s, v14.4s, v28.4s\n" + "fmin v15.4s, v15.4s, v28.4s\n" + "fmin v16.4s, v16.4s, v28.4s\n" + "fmin v17.4s, v17.4s, v28.4s\n" + "fmin v18.4s, v18.4s, v28.4s\n" + "fmin v19.4s, v19.4s, v28.4s\n" + "fmin v20.4s, v20.4s, v28.4s\n" + "fmin v21.4s, v21.4s, v28.4s\n" + "fmin v22.4s, v22.4s, v28.4s\n" + "fmin v23.4s, v23.4s, v28.4s\n" + "fmin v24.4s, v24.4s, v28.4s\n" + "fmin v25.4s, v25.4s, v28.4s\n" + "fmin v26.4s, v26.4s, v28.4s\n" + "fmin v27.4s, v27.4s, v28.4s\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" + "ld1r { v28.4s }, [x20]\n" + "fmax v8.4s, v8.4s, v28.4s\n" + "fmax v9.4s, v9.4s, v28.4s\n" + "fmax v10.4s, v10.4s, v28.4s\n" + "fmax v11.4s, v11.4s, v28.4s\n" + "fmax v12.4s, v12.4s, v28.4s\n" + "fmax v13.4s, v13.4s, v28.4s\n" + "fmax v14.4s, v14.4s, v28.4s\n" + "fmax v15.4s, v15.4s, v28.4s\n" + "fmax v16.4s, v16.4s, v28.4s\n" + "fmax v17.4s, v17.4s, v28.4s\n" + "fmax v18.4s, v18.4s, v28.4s\n" + "fmax v19.4s, v19.4s, v28.4s\n" + "fmax v20.4s, v20.4s, v28.4s\n" + "fmax v21.4s, v21.4s, v28.4s\n" + "fmax v22.4s, v22.4s, v28.4s\n" + "fmax v23.4s, v23.4s, v28.4s\n" + "fmax v24.4s, v24.4s, v28.4s\n" + "fmax v25.4s, v25.4s, v28.4s\n" + "fmax v26.4s, v26.4s, v28.4s\n" + "fmax v27.4s, v27.4s, v28.4s\n" "155:" // Height 5: No activation "cmp x8, #0x10\n" "bge 164f\n" @@ -2961,98 +2960,98 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "180:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 181f\n" - "ldr x21, [%x[input_ptr], x15, 
LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "ldr x28, [x20, #0x28]\n" "cbnz x15, 182f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20, LSL #2\n" + "add x12, x12, x20, LSL #2\n" + "add x11, x11, x20, LSL #2\n" + "add x10, x10, x20, LSL #2\n" "add x9, x9, x20, LSL #2\n" - "add x27, x27, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x21, x21, x20, LSL #2\n" + "add x28, x28, x20, LSL #2\n" "b 182f\n" "181:" // Height 6: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20, LSL #2\n" - "add x27, x9, x20, LSL #2\n" - "add x25, x27, x20, LSL #2\n" - "add x23, x25, x20, LSL #2\n" - "add x21, x23, x20, LSL #2\n" + "add x12, x13, x21, LSL #2\n" + "add x11, x12, x21, LSL #2\n" + "add x10, x11, x21, LSL #2\n" + "add x9, x10, x21, LSL #2\n" + "add x28, x9, x21, LSL #2\n" "182:" // Height 6: input setup done "cmp x14, #0x4\n" "blt 185f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x8\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x21, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x28, #0x0]\n" "ldr q6, [x17, #0x0]\n" "ldr q7, [x17, #0x10]\n" "blt 184f\n" "183:" // Height 6: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr x12, [x17, #0x28]\n" + "ldr x21, [x17, #0x28]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x38]\n" + "ldr x20, [x17, #0x38]\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "add x13, x13, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v28.4s, v6.4s, v5.s[0]\n" "ldr d6, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x48]\n" + "ldr x21, [x17, #0x48]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v29.4s, v7.4s, v5.s[0]\n" "ldr d7, [x17, #0x30]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.4s, v6.4s, v0.s[0]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr x11, [x17, #0x58]\n" + "ldr x20, [x17, #0x58]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr x10, [x13, #0x8]\n" + "ldr x27, [x13, #0x8]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x26, [x12, #0x8]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr x26, [x27, #0x8]\n" + "ldr x25, [x11, #0x8]\n" "fmla v30.4s, v6.4s, v5.s[0]\n" "ldr d6, [x17, #0x40]\n" "fmla v11.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr x12, [x17, #0x68]\n" + "ldr x21, [x17, #0x68]\n" "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x24, [x10, #0x8]\n" "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x23, [x9, #0x8]\n" "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr x20, [x21, #0x8]\n" + "ldr x22, [x28, #0x8]\n" "fmla 
v31.4s, v7.4s, v5.s[0]\n" "ldr d7, [x17, #0x50]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.4s, v6.4s, v0.s[1]\n" "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x78]\n" + "ldr x20, [x17, #0x78]\n" "fmla v16.4s, v6.4s, v2.s[1]\n" "sub x14, x14, #0x4\n" "fmla v20.4s, v6.4s, v3.s[1]\n" @@ -3062,96 +3061,96 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "fmla v28.4s, v6.4s, v5.s[1]\n" "ldr d6, [x17, #0x60]\n" "fmla v9.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0x88]\n" + "ldr x21, [x17, #0x88]\n" "fmla v17.4s, v7.4s, v2.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v25.4s, v7.4s, v4.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v29.4s, v7.4s, v5.s[1]\n" "ldr d7, [x17, #0x70]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.4s, v6.4s, v0.s[1]\n" "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr x11, [x17, #0x98]\n" + "ldr x20, [x17, #0x98]\n" "fmla v18.4s, v6.4s, v2.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v22.4s, v6.4s, v3.s[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v26.4s, v6.4s, v4.s[1]\n" "fmla v30.4s, v6.4s, v5.s[1]\n" "ldr d6, [x17, #0x80]\n" "fmla v11.4s, v7.4s, v0.s[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr x12, [x17, #0xa8]\n" + "ldr x21, [x17, #0xa8]\n" "fmla v19.4s, v7.4s, v2.s[1]\n" "fmla v23.4s, v7.4s, v3.s[1]\n" "fmla v27.4s, v7.4s, v4.s[1]\n" "fmla v31.4s, v7.4s, v5.s[1]\n" "ldr d7, [x17, #0x90]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.4s, v6.4s, v0.s[2]\n" "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xb8]\n" + "ldr x20, [x17, #0xb8]\n" "fmla v16.4s, v6.4s, v2.s[2]\n" "fmla v20.4s, v6.4s, v3.s[2]\n" "fmla v24.4s, v6.4s, v4.s[2]\n" "fmla v28.4s, v6.4s, v5.s[2]\n" "ldr d6, [x17, #0xa0]\n" "fmla v9.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xc8]\n" + "ldr x21, [x17, #0xc8]\n" "fmla v17.4s, v7.4s, v2.s[2]\n" "fmla v21.4s, v7.4s, v3.s[2]\n" "fmla v25.4s, v7.4s, v4.s[2]\n" "fmla v29.4s, v7.4s, v5.s[2]\n" "ldr d7, [x17, #0xb0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v10.4s, v6.4s, v0.s[2]\n" "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr x11, [x17, #0xd8]\n" + "ldr x20, [x17, #0xd8]\n" "fmla v18.4s, v6.4s, v2.s[2]\n" "fmla v22.4s, v6.4s, v3.s[2]\n" "fmla v26.4s, v6.4s, v4.s[2]\n" "fmla v30.4s, v6.4s, v5.s[2]\n" "ldr d6, [x17, #0xc0]\n" "fmla v11.4s, v7.4s, v0.s[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr x12, [x17, #0xe8]\n" + "ldr x21, [x17, #0xe8]\n" "fmla v19.4s, v7.4s, v2.s[2]\n" "fmla v23.4s, v7.4s, v3.s[2]\n" "fmla v27.4s, v7.4s, v4.s[2]\n" "fmla v31.4s, v7.4s, v5.s[2]\n" "ldr d7, [x17, #0xd0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "fmla v8.4s, v6.4s, v0.s[3]\n" "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0xf8]\n" + "ldr x20, [x17, #0xf8]\n" "fmla v16.4s, v6.4s, v2.s[3]\n" "fmla v20.4s, v6.4s, v3.s[3]\n" "fmla v24.4s, v6.4s, v4.s[3]\n" "fmla v28.4s, v6.4s, v5.s[3]\n" "ldr d6, [x17, #0xe0]\n" "fmla v9.4s, v7.4s, v0.s[3]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" "fmla v13.4s, v7.4s, v1.s[3]\n" "fmla v17.4s, v7.4s, v2.s[3]\n" "fmla v21.4s, v7.4s, v3.s[3]\n" "fmla v25.4s, v7.4s, v4.s[3]\n" "fmla v29.4s, v7.4s, v5.s[3]\n" 
"ldr d7, [x17, #0xf0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "add x17, x17, #0x100\n" "fmla v10.4s, v6.4s, v0.s[3]\n" - "ldr x12, [x17, #0x8]\n" + "ldr x21, [x17, #0x8]\n" "fmla v14.4s, v6.4s, v1.s[3]\n" - "ldr x11, [x17, #0x18]\n" + "ldr x20, [x17, #0x18]\n" "fmla v18.4s, v6.4s, v2.s[3]\n" "fmla v22.4s, v6.4s, v3.s[3]\n" "fmla v26.4s, v6.4s, v4.s[3]\n" @@ -3160,56 +3159,56 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "fmla v11.4s, v7.4s, v0.s[3]\n" "ldr d0, [x13, #0x0]\n" "fmla v15.4s, v7.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x12, #0x0]\n" "fmla v19.4s, v7.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x11, #0x0]\n" "fmla v23.4s, v7.4s, v3.s[3]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d3, [x10, #0x0]\n" "fmla v27.4s, v7.4s, v4.s[3]\n" - "ldr d4, [x23, #0x0]\n" + "ldr d4, [x9, #0x0]\n" "fmla v31.4s, v7.4s, v5.s[3]\n" - "ldr d5, [x21, #0x0]\n" + "ldr d5, [x28, #0x0]\n" "ldr d7, [x17, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x26\n" + "mov v2.d[1], x25\n" "mov v3.d[1], x24\n" - "mov v4.d[1], x22\n" - "mov v5.d[1], x20\n" - "mov v7.d[1], x11\n" + "mov v4.d[1], x23\n" + "mov v5.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 183b\n" "184:" // Height 6: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "add x13, x13, #0x10\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" "fmla v28.4s, v6.4s, v5.s[0]\n" "ldr q6, [x17, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "sub x14, x14, #0x4\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "prfm pldl1keep, [x13, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v29.4s, v7.4s, v5.s[0]\n" "ldr q7, [x17, #0x30]\n" "fmla v10.4s, v6.4s, v0.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" "fmla v14.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "fmla v18.4s, v6.4s, v2.s[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "fmla v22.4s, v6.4s, v3.s[0]\n" "fmla v26.4s, v6.4s, v4.s[0]\n" "fmla v30.4s, v6.4s, v5.s[0]\n" @@ -3307,42 +3306,42 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "185:" // Height 6: Multiply loop: Main loop skip "cbz x14, 187f\n" "186:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s7, [x13], #0x4\n" "sub x14, x14, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x17, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x17, #0x10]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "fmla v28.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x17, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x17, #0x30]\n" - "fmla 
v10.4s, v6.4s, v0.s[0]\n" + "ldr s6, [x12], #0x4\n" + "ldr s5, [x11], #0x4\n" + "ldr s4, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr s2, [x28], #0x4\n" + "ldr q1, [x17, #0x0]\n" + "fmla v8.4s, v1.4s, v7.s[0]\n" + "ldr q0, [x17, #0x10]\n" + "fmla v12.4s, v1.4s, v6.s[0]\n" + "fmla v16.4s, v1.4s, v5.s[0]\n" + "fmla v20.4s, v1.4s, v4.s[0]\n" + "fmla v24.4s, v1.4s, v3.s[0]\n" + "fmla v28.4s, v1.4s, v2.s[0]\n" + "ldr q1, [x17, #0x20]\n" + "fmla v9.4s, v0.4s, v7.s[0]\n" + "fmla v13.4s, v0.4s, v6.s[0]\n" + "fmla v17.4s, v0.4s, v5.s[0]\n" + "fmla v21.4s, v0.4s, v4.s[0]\n" + "fmla v25.4s, v0.4s, v3.s[0]\n" + "fmla v29.4s, v0.4s, v2.s[0]\n" + "ldr q0, [x17, #0x30]\n" + "fmla v10.4s, v1.4s, v7.s[0]\n" "add x17, x17, #0x40\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v30.4s, v6.4s, v5.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "fmla v31.4s, v7.4s, v5.s[0]\n" + "fmla v14.4s, v1.4s, v6.s[0]\n" + "fmla v18.4s, v1.4s, v5.s[0]\n" + "fmla v22.4s, v1.4s, v4.s[0]\n" + "fmla v26.4s, v1.4s, v3.s[0]\n" + "fmla v30.4s, v1.4s, v2.s[0]\n" + "fmla v11.4s, v0.4s, v7.s[0]\n" + "fmla v15.4s, v0.4s, v6.s[0]\n" + "fmla v19.4s, v0.4s, v5.s[0]\n" + "fmla v23.4s, v0.4s, v4.s[0]\n" + "fmla v27.4s, v0.4s, v3.s[0]\n" + "fmla v31.4s, v0.4s, v2.s[0]\n" "cbnz x14, 186b\n" "187:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -3584,7 +3583,6 @@ void a64_hybrid_fp32_mla_6x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "200:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp index c5e4388aa9..bb84a50282 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_6x16 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 166f\n" @@ -189,11 +188,11 @@ void a64_hybrid_fp32_mla_6x16 ( "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -210,37 +209,37 @@ void a64_hybrid_fp32_mla_6x16 ( "blt 19f\n" "18:" // Height 1: Multiply loop: Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr q16, [x10, #0x50]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x10, #0xf0]\n" "sub x27, x27, #0x4\n" "add x26, x26, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x8\n" "add x10, x10, #0x100\n" @@ -250,52 +249,52 @@ void a64_hybrid_fp32_mla_6x16 ( "bge 18b\n" "19:" // Height 1: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "ldr q17, [x10, #0x40]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "ldr q16, [x10, #0x50]\n" + "fmla v8.4s, 
v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x60]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x4\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "20:" // Height 1: Multiply loop: Main loop skip "cbz x27, 22f\n" "21:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x10, #0x0]\n" + "fmla v8.4s, v16.4s, v18.s[0]\n" "sub x27, x27, #0x1\n" - "ldr q7, [x10, #0x10]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "ldr q17, [x10, #0x10]\n" + "ldr q16, [x10, #0x20]\n" + "fmla v9.4s, v17.4s, v18.s[0]\n" + "fmla v10.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v11.4s, v16.4s, v18.s[0]\n" "add x10, x10, #0x40\n" "cbnz x27, 21b\n" "22:" // Height 1: Multiply loop: No odd multiplies @@ -306,17 +305,17 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pstl1keep, [x9, #0x0]\n" "tbz %x[flags], #1, 23f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" "23:" // Height 1: No activation "cmp x11, #0x10\n" "bge 32f\n" @@ -494,12 +493,12 @@ void a64_hybrid_fp32_mla_6x16 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -507,7 +506,7 @@ void a64_hybrid_fp32_mla_6x16 ( "b 50f\n" "49:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "50:" // Height 2: input setup done "cmp x27, #0x4\n" "blt 53f\n" @@ -520,134 +519,134 @@ void a64_hybrid_fp32_mla_6x16 ( "51:" // Height 2: Multiply loop: 
Main loop head "fmla v8.4s, v6.4s, v0.s[0]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "sub x27, x27, #0x4\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x26, x26, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v10.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr q17, [x10, #0x40]\n" "add x25, x25, #0x10\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr q16, [x10, #0x50]\n" "cmp x27, #0x8\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x60]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x10, #0x70]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x10, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "fmla v15.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v1.s[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only "fmla v8.4s, v6.4s, v0.s[0]\n" "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "add x26, x26, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x25, x25, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x40]\n" 
+ "fmla v10.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v17.4s, v1.s[0]\n" + "ldr q17, [x10, #0x40]\n" "sub x27, x27, #0x4\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v11.4s, v16.4s, v0.s[0]\n" + "fmla v15.4s, v16.4s, v1.s[0]\n" + "ldr q16, [x10, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x60]\n" + "fmla v8.4s, v17.4s, v0.s[1]\n" + "fmla v12.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v9.4s, v16.4s, v0.s[1]\n" + "fmla v13.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x10, #0x70]\n" + "fmla v10.4s, v17.4s, v0.s[1]\n" + "fmla v14.4s, v17.4s, v1.s[1]\n" + "ldr q17, [x10, #0x80]\n" + "fmla v11.4s, v16.4s, v0.s[1]\n" + "fmla v15.4s, v16.4s, v1.s[1]\n" + "ldr q16, [x10, #0x90]\n" + "fmla v8.4s, v17.4s, v0.s[2]\n" + "fmla v12.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0xa0]\n" + "fmla v9.4s, v16.4s, v0.s[2]\n" + "fmla v13.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x10, #0xb0]\n" + "fmla v10.4s, v17.4s, v0.s[2]\n" + "fmla v14.4s, v17.4s, v1.s[2]\n" + "ldr q17, [x10, #0xc0]\n" + "fmla v11.4s, v16.4s, v0.s[2]\n" + "fmla v15.4s, v16.4s, v1.s[2]\n" + "ldr q16, [x10, #0xd0]\n" + "fmla v8.4s, v17.4s, v0.s[3]\n" + "fmla v12.4s, v17.4s, v1.s[3]\n" + "ldr q17, [x10, #0xe0]\n" + "fmla v9.4s, v16.4s, v0.s[3]\n" + "fmla v13.4s, v16.4s, v1.s[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v10.4s, v17.4s, v0.s[3]\n" + "fmla v14.4s, v17.4s, v1.s[3]\n" + "fmla v11.4s, v16.4s, v0.s[3]\n" + "fmla v15.4s, v16.4s, v1.s[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x27, 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "ldr q7, [x10, #0x30]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + "fmla v8.4s, v17.4s, v19.s[0]\n" + "fmla v12.4s, v17.4s, v18.s[0]\n" + "ldr q17, [x10, #0x20]\n" + "fmla v9.4s, v16.4s, v19.s[0]\n" + "fmla v13.4s, v16.4s, v18.s[0]\n" + "ldr q16, [x10, #0x30]\n" + "fmla v10.4s, v17.4s, v19.s[0]\n" + "fmla v14.4s, v17.4s, v18.s[0]\n" "add x10, x10, #0x40\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, 
v7.4s, v1.s[0]\n" + "fmla v11.4s, v16.4s, v19.s[0]\n" + "fmla v15.4s, v16.4s, v18.s[0]\n" "cbnz x27, 54b\n" "55:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -660,25 +659,25 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pstl1keep, [x25, #0x0]\n" "tbz %x[flags], #1, 56f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v17.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" + "ld1r { v16.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v17.4s\n" + "fmin v9.4s, v9.4s, v17.4s\n" + "fmin v10.4s, v10.4s, v17.4s\n" + "fmin v11.4s, v11.4s, v17.4s\n" + "fmin v12.4s, v12.4s, v17.4s\n" + "fmin v13.4s, v13.4s, v17.4s\n" + "fmin v14.4s, v14.4s, v17.4s\n" + "fmin v15.4s, v15.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v16.4s\n" + "fmax v9.4s, v9.4s, v16.4s\n" + "fmax v10.4s, v10.4s, v16.4s\n" + "fmax v11.4s, v11.4s, v16.4s\n" + "fmax v12.4s, v12.4s, v16.4s\n" + "fmax v13.4s, v13.4s, v16.4s\n" + "fmax v14.4s, v14.4s, v16.4s\n" + "fmax v15.4s, v15.4s, v16.4s\n" "56:" // Height 2: No activation "cmp x11, #0x10\n" "bge 65f\n" @@ -905,13 +904,13 @@ void a64_hybrid_fp32_mla_6x16 ( "81:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 82f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 83f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -920,8 +919,8 @@ void a64_hybrid_fp32_mla_6x16 ( "b 83f\n" "82:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "83:" // Height 3: input setup done "cmp x27, #0x4\n" "blt 86f\n" @@ -938,75 +937,75 @@ void a64_hybrid_fp32_mla_6x16 ( "sub x27, x27, #0x4\n" "add x26, x26, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "add x25, x25, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "add x24, x24, #0x10\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" "cmp x27, #0x8\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, 
v7.4s, v2.s[0]\n" - "ldr q7, [x10, #0x50]\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr q20, [x10, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x10, #0x70]\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x80]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x10, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" + "fmla v13.4s, v20.4s, v1.s[2]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x10, #0xb0]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0xc0]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + "fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x10, #0xd0]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr q21, [x10, #0xe0]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 84b\n" @@ -1016,95 +1015,95 @@ void a64_hybrid_fp32_mla_6x16 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" "fmla 
v9.4s, v7.4s, v0.s[0]\n" "add x24, x24, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "sub x27, x27, #0x4\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v21.4s, v0.s[0]\n" + "fmla v14.4s, v21.4s, v1.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v18.4s, v21.4s, v2.s[0]\n" + "ldr q21, [x10, #0x40]\n" + "fmla v11.4s, v20.4s, v0.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v15.4s, v20.4s, v1.s[0]\n" + "fmla v19.4s, v20.4s, v2.s[0]\n" + "ldr q20, [x10, #0x50]\n" + "fmla v8.4s, v21.4s, v0.s[1]\n" + "fmla v12.4s, v21.4s, v1.s[1]\n" + "fmla v16.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x60]\n" + "fmla v9.4s, v20.4s, v0.s[1]\n" + "fmla v13.4s, v20.4s, v1.s[1]\n" + "fmla v17.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x10, #0x70]\n" + "fmla v10.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v2.s[1]\n" + "ldr q21, [x10, #0x80]\n" + "fmla v11.4s, v20.4s, v0.s[1]\n" + "fmla v15.4s, v20.4s, v1.s[1]\n" + "fmla v19.4s, v20.4s, v2.s[1]\n" + "ldr q20, [x10, #0x90]\n" + "fmla v8.4s, v21.4s, v0.s[2]\n" + "fmla v12.4s, v21.4s, v1.s[2]\n" + "fmla v16.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0xa0]\n" + "fmla v9.4s, v20.4s, v0.s[2]\n" + "fmla v13.4s, v20.4s, v1.s[2]\n" + "fmla v17.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x10, #0xb0]\n" + "fmla v10.4s, v21.4s, v0.s[2]\n" + "fmla v14.4s, v21.4s, v1.s[2]\n" + "fmla v18.4s, v21.4s, v2.s[2]\n" + "ldr q21, [x10, #0xc0]\n" + "fmla v11.4s, v20.4s, v0.s[2]\n" + "fmla v15.4s, v20.4s, v1.s[2]\n" + "fmla v19.4s, v20.4s, v2.s[2]\n" + "ldr q20, [x10, #0xd0]\n" + "fmla v8.4s, v21.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v1.s[3]\n" + "fmla v16.4s, v21.4s, v2.s[3]\n" + "ldr q21, [x10, #0xe0]\n" + "fmla v9.4s, v20.4s, v0.s[3]\n" + "fmla v13.4s, v20.4s, v1.s[3]\n" + "fmla v17.4s, v20.4s, v2.s[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - 
"fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v10.4s, v21.4s, v0.s[3]\n" + "fmla v14.4s, v21.4s, v1.s[3]\n" + "fmla v18.4s, v21.4s, v2.s[3]\n" + "fmla v11.4s, v20.4s, v0.s[3]\n" + "fmla v15.4s, v20.4s, v1.s[3]\n" + "fmla v19.4s, v20.4s, v2.s[3]\n" "86:" // Height 3: Multiply loop: Main loop skip "cbz x27, 88f\n" "87:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x10, #0x0]\n" + "fmla v8.4s, v21.4s, v24.s[0]\n" + "fmla v12.4s, v21.4s, v23.s[0]\n" + "ldr q20, [x10, #0x10]\n" + "fmla v16.4s, v21.4s, v22.s[0]\n" + "ldr q21, [x10, #0x20]\n" + "fmla v9.4s, v20.4s, v24.s[0]\n" + "fmla v13.4s, v20.4s, v23.s[0]\n" + "fmla v17.4s, v20.4s, v22.s[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v10.4s, v21.4s, v24.s[0]\n" + "fmla v14.4s, v21.4s, v23.s[0]\n" + "fmla v18.4s, v21.4s, v22.s[0]\n" + "fmla v11.4s, v20.4s, v24.s[0]\n" + "fmla v15.4s, v20.4s, v23.s[0]\n" + "fmla v19.4s, v20.4s, v22.s[0]\n" "cbnz x27, 87b\n" "88:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1119,33 +1118,33 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pstl1keep, [x24, #0x0]\n" "tbz %x[flags], #1, 89f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v21.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v20.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v21.4s\n" + "fmin v9.4s, v9.4s, v21.4s\n" + "fmin v10.4s, v10.4s, v21.4s\n" + "fmin v11.4s, v11.4s, v21.4s\n" + "fmin v12.4s, v12.4s, v21.4s\n" + "fmin v13.4s, v13.4s, v21.4s\n" + "fmin v14.4s, v14.4s, v21.4s\n" + "fmin v15.4s, v15.4s, v21.4s\n" + "fmin v16.4s, v16.4s, v21.4s\n" + "fmin v17.4s, v17.4s, v21.4s\n" + "fmin v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v20.4s\n" + "fmax v9.4s, v9.4s, v20.4s\n" + "fmax v10.4s, v10.4s, v20.4s\n" + "fmax v11.4s, v11.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v20.4s\n" + "fmax v13.4s, v13.4s, v20.4s\n" + "fmax v14.4s, v14.4s, v20.4s\n" + "fmax 
v15.4s, v15.4s, v20.4s\n" + "fmax v16.4s, v16.4s, v20.4s\n" + "fmax v17.4s, v17.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v20.4s\n" + "fmax v19.4s, v19.4s, v20.4s\n" "89:" // Height 3: No activation "cmp x11, #0x10\n" "bge 98f\n" @@ -1421,14 +1420,14 @@ void a64_hybrid_fp32_mla_6x16 ( "114:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 115f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 116f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1438,9 +1437,9 @@ void a64_hybrid_fp32_mla_6x16 ( "b 116f\n" "115:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "116:" // Height 4: input setup done "cmp x27, #0x4\n" "blt 119f\n" @@ -1459,7 +1458,7 @@ void a64_hybrid_fp32_mla_6x16 ( "add x26, x26, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x25, x25, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -1467,85 +1466,85 @@ void a64_hybrid_fp32_mla_6x16 ( "add x23, x23, #0x10\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "cmp x27, #0x8\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, 
v3.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr q24, [x10, #0x50]\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x10, #0x70]\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "fmla v22.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x80]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x10, #0x90]\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0xa0]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x10, #0xb0]\n" + "fmla v10.4s, v25.4s, v0.s[2]\n" + "fmla v14.4s, v25.4s, v1.s[2]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0xc0]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x10, #0xd0]\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr q25, [x10, #0xe0]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "fmla v17.4s, v24.4s, v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 117b\n" @@ -1556,7 +1555,7 @@ void a64_hybrid_fp32_mla_6x16 ( "add x25, x25, #0x10\n" "fmla v16.4s, v6.4s, v2.s[0]\n" "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x24, x24, #0x10\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -1564,109 +1563,109 @@ void a64_hybrid_fp32_mla_6x16 ( "sub x27, x27, 
#0x4\n" "fmla v17.4s, v7.4s, v2.s[0]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v25.4s, v0.s[0]\n" + "fmla v14.4s, v25.4s, v1.s[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x40]\n" + "fmla v18.4s, v25.4s, v2.s[0]\n" + "fmla v22.4s, v25.4s, v3.s[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v11.4s, v24.4s, v0.s[0]\n" + "fmla v15.4s, v24.4s, v1.s[0]\n" + "fmla v19.4s, v24.4s, v2.s[0]\n" + "fmla v23.4s, v24.4s, v3.s[0]\n" + "ldr q24, [x10, #0x50]\n" + "fmla v8.4s, v25.4s, v0.s[1]\n" + "fmla v12.4s, v25.4s, v1.s[1]\n" + "fmla v16.4s, v25.4s, v2.s[1]\n" + "fmla v20.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x60]\n" + "fmla v9.4s, v24.4s, v0.s[1]\n" + "fmla v13.4s, v24.4s, v1.s[1]\n" + "fmla v17.4s, v24.4s, v2.s[1]\n" + "fmla v21.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x10, #0x70]\n" + "fmla v10.4s, v25.4s, v0.s[1]\n" + "fmla v14.4s, v25.4s, v1.s[1]\n" + "fmla v18.4s, v25.4s, v2.s[1]\n" + "fmla v22.4s, v25.4s, v3.s[1]\n" + "ldr q25, [x10, #0x80]\n" + "fmla v11.4s, v24.4s, v0.s[1]\n" + "fmla v15.4s, v24.4s, v1.s[1]\n" + "fmla v19.4s, v24.4s, v2.s[1]\n" + "fmla v23.4s, v24.4s, v3.s[1]\n" + "ldr q24, [x10, #0x90]\n" + "fmla v8.4s, v25.4s, v0.s[2]\n" + "fmla v12.4s, v25.4s, v1.s[2]\n" + "fmla v16.4s, v25.4s, v2.s[2]\n" + "fmla v20.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0xa0]\n" + "fmla v9.4s, v24.4s, v0.s[2]\n" + "fmla v13.4s, v24.4s, v1.s[2]\n" + "fmla v17.4s, v24.4s, v2.s[2]\n" + "fmla v21.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x10, #0xb0]\n" + 
"fmla v10.4s, v25.4s, v0.s[2]\n" + "fmla v14.4s, v25.4s, v1.s[2]\n" + "fmla v18.4s, v25.4s, v2.s[2]\n" + "fmla v22.4s, v25.4s, v3.s[2]\n" + "ldr q25, [x10, #0xc0]\n" + "fmla v11.4s, v24.4s, v0.s[2]\n" + "fmla v15.4s, v24.4s, v1.s[2]\n" + "fmla v19.4s, v24.4s, v2.s[2]\n" + "fmla v23.4s, v24.4s, v3.s[2]\n" + "ldr q24, [x10, #0xd0]\n" + "fmla v8.4s, v25.4s, v0.s[3]\n" + "fmla v12.4s, v25.4s, v1.s[3]\n" + "fmla v16.4s, v25.4s, v2.s[3]\n" + "fmla v20.4s, v25.4s, v3.s[3]\n" + "ldr q25, [x10, #0xe0]\n" + "fmla v9.4s, v24.4s, v0.s[3]\n" + "fmla v13.4s, v24.4s, v1.s[3]\n" + "fmla v17.4s, v24.4s, v2.s[3]\n" + "fmla v21.4s, v24.4s, v3.s[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v10.4s, v25.4s, v0.s[3]\n" + "fmla v14.4s, v25.4s, v1.s[3]\n" + "fmla v18.4s, v25.4s, v2.s[3]\n" + "fmla v22.4s, v25.4s, v3.s[3]\n" + "fmla v11.4s, v24.4s, v0.s[3]\n" + "fmla v15.4s, v24.4s, v1.s[3]\n" + "fmla v19.4s, v24.4s, v2.s[3]\n" + "fmla v23.4s, v24.4s, v3.s[3]\n" "119:" // Height 4: Multiply loop: Main loop skip "cbz x27, 121f\n" "120:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + "fmla v8.4s, v25.4s, v29.s[0]\n" + "fmla v12.4s, v25.4s, v28.s[0]\n" + "fmla v16.4s, v25.4s, v27.s[0]\n" + "fmla v20.4s, v25.4s, v26.s[0]\n" + "ldr q25, [x10, #0x20]\n" + "fmla v9.4s, v24.4s, v29.s[0]\n" + "fmla v13.4s, v24.4s, v28.s[0]\n" + "fmla v17.4s, v24.4s, v27.s[0]\n" + "fmla v21.4s, v24.4s, v26.s[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v10.4s, v25.4s, v29.s[0]\n" + "fmla v14.4s, v25.4s, v28.s[0]\n" + "fmla v18.4s, v25.4s, v27.s[0]\n" + "fmla v22.4s, v25.4s, v26.s[0]\n" + "fmla v11.4s, v24.4s, v29.s[0]\n" + "fmla v15.4s, v24.4s, v28.s[0]\n" + "fmla v19.4s, v24.4s, v27.s[0]\n" + "fmla v23.4s, v24.4s, v26.s[0]\n" "cbnz x27, 120b\n" "121:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1683,41 +1682,41 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pstl1keep, [x23, #0x0]\n" "tbz %x[flags], #1, 122f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v25.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, 
v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" + "ld1r { v24.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v25.4s\n" + "fmin v9.4s, v9.4s, v25.4s\n" + "fmin v10.4s, v10.4s, v25.4s\n" + "fmin v11.4s, v11.4s, v25.4s\n" + "fmin v12.4s, v12.4s, v25.4s\n" + "fmin v13.4s, v13.4s, v25.4s\n" + "fmin v14.4s, v14.4s, v25.4s\n" + "fmin v15.4s, v15.4s, v25.4s\n" + "fmin v16.4s, v16.4s, v25.4s\n" + "fmin v17.4s, v17.4s, v25.4s\n" + "fmin v18.4s, v18.4s, v25.4s\n" + "fmin v19.4s, v19.4s, v25.4s\n" + "fmin v20.4s, v20.4s, v25.4s\n" + "fmin v21.4s, v21.4s, v25.4s\n" + "fmin v22.4s, v22.4s, v25.4s\n" + "fmin v23.4s, v23.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v24.4s\n" + "fmax v9.4s, v9.4s, v24.4s\n" + "fmax v10.4s, v10.4s, v24.4s\n" + "fmax v11.4s, v11.4s, v24.4s\n" + "fmax v12.4s, v12.4s, v24.4s\n" + "fmax v13.4s, v13.4s, v24.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "fmax v23.4s, v23.4s, v24.4s\n" "122:" // Height 4: No activation "cmp x11, #0x10\n" "bge 131f\n" @@ -2028,168 +2027,168 @@ void a64_hybrid_fp32_mla_6x16 ( "movi v16.16b, #0x0\n" "movi v17.16b, #0x0\n" "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "146:" // Height 5: setup done - "mov x28, #0x0\n" - "147:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 148f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 149f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x22, x22, x20, LSL #2\n" - "b 149f\n" - "148:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "149:" // Height 5: input setup done - "cmp x27, #0x4\n" - "blt 152f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - 
"ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "blt 151f\n" - "150:" // Height 5: Multiply loop: Main loop head - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "add x23, x23, #0x10\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x8\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "146:" // Height 5: setup done + "mov x28, #0x0\n" + "147:" // Height 5: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 148f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 149f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #2\n" + "add x25, x25, x20, LSL #2\n" + "add x24, x24, x20, LSL #2\n" + "add x23, x23, x20, LSL #2\n" + "add x22, x22, x20, LSL #2\n" + "b 149f\n" + "148:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "149:" // Height 5: input setup done + "cmp x27, #0x4\n" + "blt 152f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x8\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 151f\n" + "150:" // Height 5: Multiply loop: Main loop head + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x27, x27, #0x4\n" + "add x26, x26, #0x10\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x23, x23, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x8\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr q28, [x10, #0x50]\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x10, #0x70]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x80]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x10, #0x90]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0xa0]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + "fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x10, #0xb0]\n" + "fmla 
v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x10, #0xd0]\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr q29, [x10, #0xe0]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" "ldr q6, [x10, #0x0]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" "ldr q0, [x26, #0x0]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" "ldr q1, [x25, #0x0]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" "ldr q2, [x24, #0x0]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" "ldr q3, [x23, #0x0]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 150b\n" @@ -2203,7 +2202,7 @@ void a64_hybrid_fp32_mla_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q29, [x10, #0x20]\n" "fmla v9.4s, v7.4s, v0.s[0]\n" "add x22, x22, #0x10\n" "fmla v13.4s, v7.4s, v1.s[0]\n" @@ -2212,128 +2211,128 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "fmla v21.4s, v7.4s, v3.s[0]\n" "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q28, [x10, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v10.4s, v29.4s, v0.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v18.4s, v29.4s, v2.s[0]\n" + "fmla v22.4s, v29.4s, v3.s[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x40]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x50]\n" - "fmla v8.4s, v6.4s, v0.s[1]\n" - "fmla v12.4s, v6.4s, v1.s[1]\n" - "fmla v16.4s, v6.4s, v2.s[1]\n" - "fmla v20.4s, v6.4s, v3.s[1]\n" - "fmla v24.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x60]\n" - "fmla v9.4s, v7.4s, v0.s[1]\n" - "fmla v13.4s, v7.4s, v1.s[1]\n" - "fmla v17.4s, v7.4s, v2.s[1]\n" - "fmla v21.4s, v7.4s, v3.s[1]\n" - "fmla v25.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x10, #0x70]\n" - "fmla v10.4s, v6.4s, v0.s[1]\n" - "fmla v14.4s, v6.4s, v1.s[1]\n" - "fmla v18.4s, v6.4s, v2.s[1]\n" - "fmla v22.4s, v6.4s, v3.s[1]\n" - "fmla v26.4s, v6.4s, v4.s[1]\n" - "ldr q6, [x10, #0x80]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v1.s[1]\n" - "fmla v19.4s, v7.4s, v2.s[1]\n" - "fmla 
v23.4s, v7.4s, v3.s[1]\n" - "fmla v27.4s, v7.4s, v4.s[1]\n" - "ldr q7, [x10, #0x90]\n" - "fmla v8.4s, v6.4s, v0.s[2]\n" - "fmla v12.4s, v6.4s, v1.s[2]\n" - "fmla v16.4s, v6.4s, v2.s[2]\n" - "fmla v20.4s, v6.4s, v3.s[2]\n" - "fmla v24.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[2]\n" - "fmla v13.4s, v7.4s, v1.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[2]\n" - "fmla v21.4s, v7.4s, v3.s[2]\n" - "fmla v25.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x10, #0xb0]\n" - "fmla v10.4s, v6.4s, v0.s[2]\n" - "fmla v14.4s, v6.4s, v1.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[2]\n" - "fmla v22.4s, v6.4s, v3.s[2]\n" - "fmla v26.4s, v6.4s, v4.s[2]\n" - "ldr q6, [x10, #0xc0]\n" - "fmla v11.4s, v7.4s, v0.s[2]\n" - "fmla v15.4s, v7.4s, v1.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[2]\n" - "fmla v23.4s, v7.4s, v3.s[2]\n" - "fmla v27.4s, v7.4s, v4.s[2]\n" - "ldr q7, [x10, #0xd0]\n" - "fmla v8.4s, v6.4s, v0.s[3]\n" - "fmla v12.4s, v6.4s, v1.s[3]\n" - "fmla v16.4s, v6.4s, v2.s[3]\n" - "fmla v20.4s, v6.4s, v3.s[3]\n" - "fmla v24.4s, v6.4s, v4.s[3]\n" - "ldr q6, [x10, #0xe0]\n" - "fmla v9.4s, v7.4s, v0.s[3]\n" - "fmla v13.4s, v7.4s, v1.s[3]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "fmla v21.4s, v7.4s, v3.s[3]\n" - "fmla v25.4s, v7.4s, v4.s[3]\n" - "ldr q7, [x10, #0xf0]\n" + "fmla v26.4s, v29.4s, v4.s[0]\n" + "ldr q29, [x10, #0x40]\n" + "fmla v11.4s, v28.4s, v0.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v2.s[0]\n" + "fmla v23.4s, v28.4s, v3.s[0]\n" + "fmla v27.4s, v28.4s, v4.s[0]\n" + "ldr q28, [x10, #0x50]\n" + "fmla v8.4s, v29.4s, v0.s[1]\n" + "fmla v12.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[1]\n" + "fmla v20.4s, v29.4s, v3.s[1]\n" + "fmla v24.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x60]\n" + "fmla v9.4s, v28.4s, v0.s[1]\n" + "fmla v13.4s, v28.4s, v1.s[1]\n" + "fmla v17.4s, v28.4s, v2.s[1]\n" + "fmla v21.4s, v28.4s, v3.s[1]\n" + "fmla v25.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x10, #0x70]\n" + "fmla v10.4s, v29.4s, v0.s[1]\n" + "fmla v14.4s, v29.4s, v1.s[1]\n" + "fmla v18.4s, v29.4s, v2.s[1]\n" + "fmla v22.4s, v29.4s, v3.s[1]\n" + "fmla v26.4s, v29.4s, v4.s[1]\n" + "ldr q29, [x10, #0x80]\n" + "fmla v11.4s, v28.4s, v0.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[1]\n" + "fmla v19.4s, v28.4s, v2.s[1]\n" + "fmla v23.4s, v28.4s, v3.s[1]\n" + "fmla v27.4s, v28.4s, v4.s[1]\n" + "ldr q28, [x10, #0x90]\n" + "fmla v8.4s, v29.4s, v0.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[2]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[2]\n" + "fmla v24.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0xa0]\n" + "fmla v9.4s, v28.4s, v0.s[2]\n" + "fmla v13.4s, v28.4s, v1.s[2]\n" + "fmla v17.4s, v28.4s, v2.s[2]\n" + "fmla v21.4s, v28.4s, v3.s[2]\n" + "fmla v25.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x10, #0xb0]\n" + "fmla v10.4s, v29.4s, v0.s[2]\n" + "fmla v14.4s, v29.4s, v1.s[2]\n" + "fmla v18.4s, v29.4s, v2.s[2]\n" + "fmla v22.4s, v29.4s, v3.s[2]\n" + "fmla v26.4s, v29.4s, v4.s[2]\n" + "ldr q29, [x10, #0xc0]\n" + "fmla v11.4s, v28.4s, v0.s[2]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v19.4s, v28.4s, v2.s[2]\n" + "fmla v23.4s, v28.4s, v3.s[2]\n" + "fmla v27.4s, v28.4s, v4.s[2]\n" + "ldr q28, [x10, #0xd0]\n" + "fmla v8.4s, v29.4s, v0.s[3]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + "fmla v16.4s, v29.4s, v2.s[3]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "fmla v24.4s, v29.4s, v4.s[3]\n" + "ldr q29, [x10, #0xe0]\n" + "fmla v9.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[3]\n" + "fmla v17.4s, v28.4s, v2.s[3]\n" + "fmla v21.4s, v28.4s, v3.s[3]\n" + "fmla v25.4s, v28.4s, v4.s[3]\n" + "ldr q28, [x10, #0xf0]\n" 
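// Annotation (not part of the generated kernel source): the renaming
// pattern scales with each height specialisation, which picks its scratch
// pair just above its accumulator block: Height 2 uses v17/v16
// (accumulators v8-v15), Height 3 v21/v20, Height 4 v25/v24, and Height 5,
// as here, v29/v28 (accumulators v8-v27). Height 6 needs v8-v31 for
// accumulators, so its odd-block loop recycles v0-v7 instead. The x20/x21
// swap in the string-loop setup appears to follow the same register-lifetime
// discipline: input_offset now stays live in x21 across both setup paths,
// leaving x20 for the transient pointer and initial-column loads.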
"add x10, x10, #0x100\n" - "fmla v10.4s, v6.4s, v0.s[3]\n" - "fmla v14.4s, v6.4s, v1.s[3]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v22.4s, v6.4s, v3.s[3]\n" - "fmla v26.4s, v6.4s, v4.s[3]\n" - "fmla v11.4s, v7.4s, v0.s[3]\n" - "fmla v15.4s, v7.4s, v1.s[3]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v23.4s, v7.4s, v3.s[3]\n" - "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v10.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[3]\n" + "fmla v18.4s, v29.4s, v2.s[3]\n" + "fmla v22.4s, v29.4s, v3.s[3]\n" + "fmla v26.4s, v29.4s, v4.s[3]\n" + "fmla v11.4s, v28.4s, v0.s[3]\n" + "fmla v15.4s, v28.4s, v1.s[3]\n" + "fmla v19.4s, v28.4s, v2.s[3]\n" + "fmla v23.4s, v28.4s, v3.s[3]\n" + "fmla v27.4s, v28.4s, v4.s[3]\n" "152:" // Height 5: Multiply loop: Main loop skip "cbz x27, 154f\n" "153:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x10, #0x0]\n" + "fmla v8.4s, v29.4s, v2.s[0]\n" + "fmla v12.4s, v29.4s, v1.s[0]\n" + "ldr q28, [x10, #0x10]\n" + "fmla v16.4s, v29.4s, v0.s[0]\n" + "fmla v20.4s, v29.4s, v31.s[0]\n" + "fmla v24.4s, v29.4s, v30.s[0]\n" + "ldr q29, [x10, #0x20]\n" + "fmla v9.4s, v28.4s, v2.s[0]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v17.4s, v28.4s, v0.s[0]\n" + "fmla v21.4s, v28.4s, v31.s[0]\n" + "fmla v25.4s, v28.4s, v30.s[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v10.4s, v29.4s, v2.s[0]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v18.4s, v29.4s, v0.s[0]\n" + "fmla v22.4s, v29.4s, v31.s[0]\n" + "fmla v26.4s, v29.4s, v30.s[0]\n" + "fmla v11.4s, v28.4s, v2.s[0]\n" + "fmla v15.4s, v28.4s, v1.s[0]\n" + "fmla v19.4s, v28.4s, v0.s[0]\n" + "fmla v23.4s, v28.4s, v31.s[0]\n" + "fmla v27.4s, v28.4s, v30.s[0]\n" "cbnz x27, 153b\n" "154:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2352,49 +2351,49 @@ void a64_hybrid_fp32_mla_6x16 ( "prfm pstl1keep, [x22, #0x0]\n" "tbz %x[flags], #1, 155f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v29.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - 
"fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmin v26.4s, v26.4s, v1.4s\n" - "fmin v27.4s, v27.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" + "ld1r { v28.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v29.4s\n" + "fmin v9.4s, v9.4s, v29.4s\n" + "fmin v10.4s, v10.4s, v29.4s\n" + "fmin v11.4s, v11.4s, v29.4s\n" + "fmin v12.4s, v12.4s, v29.4s\n" + "fmin v13.4s, v13.4s, v29.4s\n" + "fmin v14.4s, v14.4s, v29.4s\n" + "fmin v15.4s, v15.4s, v29.4s\n" + "fmin v16.4s, v16.4s, v29.4s\n" + "fmin v17.4s, v17.4s, v29.4s\n" + "fmin v18.4s, v18.4s, v29.4s\n" + "fmin v19.4s, v19.4s, v29.4s\n" + "fmin v20.4s, v20.4s, v29.4s\n" + "fmin v21.4s, v21.4s, v29.4s\n" + "fmin v22.4s, v22.4s, v29.4s\n" + "fmin v23.4s, v23.4s, v29.4s\n" + "fmin v24.4s, v24.4s, v29.4s\n" + "fmin v25.4s, v25.4s, v29.4s\n" + "fmin v26.4s, v26.4s, v29.4s\n" + "fmin v27.4s, v27.4s, v29.4s\n" + "fmax v8.4s, v8.4s, v28.4s\n" + "fmax v9.4s, v9.4s, v28.4s\n" + "fmax v10.4s, v10.4s, v28.4s\n" + "fmax v11.4s, v11.4s, v28.4s\n" + "fmax v12.4s, v12.4s, v28.4s\n" + "fmax v13.4s, v13.4s, v28.4s\n" + "fmax v14.4s, v14.4s, v28.4s\n" + "fmax v15.4s, v15.4s, v28.4s\n" + "fmax v16.4s, v16.4s, v28.4s\n" + "fmax v17.4s, v17.4s, v28.4s\n" + "fmax v18.4s, v18.4s, v28.4s\n" + "fmax v19.4s, v19.4s, v28.4s\n" + "fmax v20.4s, v20.4s, v28.4s\n" + "fmax v21.4s, v21.4s, v28.4s\n" + "fmax v22.4s, v22.4s, v28.4s\n" + "fmax v23.4s, v23.4s, v28.4s\n" + "fmax v24.4s, v24.4s, v28.4s\n" + "fmax v25.4s, v25.4s, v28.4s\n" + "fmax v26.4s, v26.4s, v28.4s\n" + "fmax v27.4s, v27.4s, v28.4s\n" "155:" // Height 5: No activation "cmp x11, #0x10\n" "bge 164f\n" @@ -2771,16 +2770,16 @@ void a64_hybrid_fp32_mla_6x16 ( "180:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 181f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 182f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -2792,11 +2791,11 @@ void a64_hybrid_fp32_mla_6x16 ( "b 182f\n" "181:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, 
x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "182:" // Height 6: input setup done "cmp x27, #0x4\n" "blt 185f\n" @@ -3073,42 +3072,42 @@ void a64_hybrid_fp32_mla_6x16 ( "185:" // Height 6: Multiply loop: Main loop skip "cbz x27, 187f\n" "186:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x1\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v1.s[0]\n" - "fmla v16.4s, v6.4s, v2.s[0]\n" - "fmla v20.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v4.s[0]\n" - "fmla v28.4s, v6.4s, v5.s[0]\n" - "ldr q6, [x10, #0x20]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v1.s[0]\n" - "fmla v17.4s, v7.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v4.s[0]\n" - "fmla v29.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + "fmla v8.4s, v1.4s, v7.s[0]\n" + "fmla v12.4s, v1.4s, v6.s[0]\n" + "fmla v16.4s, v1.4s, v5.s[0]\n" + "fmla v20.4s, v1.4s, v4.s[0]\n" + "fmla v24.4s, v1.4s, v3.s[0]\n" + "fmla v28.4s, v1.4s, v2.s[0]\n" + "ldr q1, [x10, #0x20]\n" + "fmla v9.4s, v0.4s, v7.s[0]\n" + "fmla v13.4s, v0.4s, v6.s[0]\n" + "fmla v17.4s, v0.4s, v5.s[0]\n" + "fmla v21.4s, v0.4s, v4.s[0]\n" + "fmla v25.4s, v0.4s, v3.s[0]\n" + "fmla v29.4s, v0.4s, v2.s[0]\n" + "ldr q0, [x10, #0x30]\n" "add x10, x10, #0x40\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v14.4s, v6.4s, v1.s[0]\n" - "fmla v18.4s, v6.4s, v2.s[0]\n" - "fmla v22.4s, v6.4s, v3.s[0]\n" - "fmla v26.4s, v6.4s, v4.s[0]\n" - "fmla v30.4s, v6.4s, v5.s[0]\n" - "fmla v11.4s, v7.4s, v0.s[0]\n" - "fmla v15.4s, v7.4s, v1.s[0]\n" - "fmla v19.4s, v7.4s, v2.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[0]\n" - "fmla v27.4s, v7.4s, v4.s[0]\n" - "fmla v31.4s, v7.4s, v5.s[0]\n" + "fmla v10.4s, v1.4s, v7.s[0]\n" + "fmla v14.4s, v1.4s, v6.s[0]\n" + "fmla v18.4s, v1.4s, v5.s[0]\n" + "fmla v22.4s, v1.4s, v4.s[0]\n" + "fmla v26.4s, v1.4s, v3.s[0]\n" + "fmla v30.4s, v1.4s, v2.s[0]\n" + "fmla v11.4s, v0.4s, v7.s[0]\n" + "fmla v15.4s, v0.4s, v6.s[0]\n" + "fmla v19.4s, v0.4s, v5.s[0]\n" + "fmla v23.4s, v0.4s, v4.s[0]\n" + "fmla v27.4s, v0.4s, v3.s[0]\n" + "fmla v31.4s, v0.4s, v2.s[0]\n" "cbnz x27, 186b\n" "187:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -3350,7 +3349,6 @@ void a64_hybrid_fp32_mla_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "200:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp index 4fad58a83d..3ec02395d1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #define ARGLIST \ @@ -90,5 +90,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp index 67e0c1e8cc..236865315e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, 2023 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4_a55 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x8\n" "bge 148f\n" @@ -105,563 +104,563 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "cmp %x[M], #0x2\n" "bgt 43f\n" "beq 22f\n" - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x15, %x[bias]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "2:" // Height 1: Column loop - "cbz x15, 3f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 3f\n" + "ldr q24, [x3, #0x0]\n" + "add x3, x3, #0x10\n" "b 8f\n" "3:" // Height 1: no bias "tbz %x[flags], #0, 7f\n" - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 6f\n" - "tbz x17, #1, 4f\n" - "ldr d24, [x14], #0x8\n" - "mov x8, #0x8\n" - "tbz x17, #0, 5f\n" - "ld1 { v24.s }[2], [x14]\n" + "tbz x4, #1, 4f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "tbz x4, #0, 5f\n" + "ld1 { v24.s }[2], [x6]\n" "b 5f\n" "4:" // Height 1: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" "5:" // Height 1: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 8f\n" "6:" // Height 1: full accumulate - "ldr q24, [x14, #0x0]\n" + "ldr q24, [x6, #0x0]\n" "b 8f\n" "7:" // Height 1: no accumulate "movi v24.16b, #0x0\n" "8:" // Height 1: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "9:" // Height 1: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 10f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "cbnz x13, 11f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "cbnz x7, 11f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" "b 11f\n" "10:" // Height 1: setup direct input - "mov x11, %x[input_ptr]\n" + "mov x17, %x[input_ptr]\n" "11:" // Height 1: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 14f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q8, [x16, #0x0]\n" - "cmp x12, #0x8\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 13f\n" "12:" // Height 1: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" - "ldr x8, [x16, #0x18]\n" - "add x11, x11, #0x10\n" - "ldr d10, [x16, #0x20]\n" - "sub x12, x12, #0x4\n" - "ldr x21, [x16, #0x28]\n" - "cmp x12, #0x8\n" - "mov v9.d[1], x8\n" - "ldr d11, [x16, #0x30]\n" - "ldr x8, [x16, #0x38]\n" - "add x16, x16, #0x40\n" + "add x17, x17, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" - "prfm pldl1keep, [x11, #0x80]\n" - "mov v11.d[1], x8\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "ldr x10, [x11, #0x8]\n" - "mov v8.d[1], x26\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" - "mov v0.d[1], 
x10\n" + "ldr d0, [x17, #0x0]\n" + "sub x8, x8, #0x4\n" + "ldr d10, [x5, #0x20]\n" + "cmp x8, #0x8\n" + "ldr d11, [x5, #0x30]\n" + "ldr x26, [x5, #0x8]\n" + "mov v8.d[1], x26\n" + "ldr x26, [x5, #0x18]\n" + "mov v9.d[1], x26\n" + "ldr x26, [x17, #0x8]\n" + "mov v0.d[1], x26\n" + "ldr x26, [x5, #0x28]\n" + "mov v10.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x17, #0x80]\n" "bge 12b\n" "13:" // Height 1: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" - "ldr q10, [x16, #0x20]\n" - "sub x12, x12, #0x4\n" - "ldr q11, [x16, #0x30]\n" - "add x11, x11, #0x10\n" - "prfm pldl1keep, [x11, #0x80]\n" - "add x16, x16, #0x40\n" + "add x17, x17, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" + "sub x8, x8, #0x4\n" "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" + "add x5, x5, #0x40\n" "14:" // Height 1: Multiply loop: Main loop skip - "cbz x12, 16f\n" + "cbz x8, 16f\n" "15:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "cbnz x12, 15b\n" + "ldr s17, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v17.s[0]\n" + "add x5, x5, #0x10\n" + "cbnz x8, 15b\n" "16:" // Height 1: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 9b\n" - "prfm pstl1keep, [x14, #0x0]\n" + "prfm pstl1keep, [x6, #0x0]\n" "tbz %x[flags], #1, 17f\n" - "add x8, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x8]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" "17:" // Height 1: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 20f\n" - "tbz x17, #1, 18f\n" - "str d24, [x14], #0x8\n" - "tbz x17, #0, 19f\n" - "st1 { v24.s }[2], [x14]\n" + "tbz x4, #1, 18f\n" + "str d24, [x6], #0x8\n" + "tbz x4, #0, 19f\n" + "st1 { v24.s }[2], [x6]\n" "b 19f\n" "18:" // Height 1: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" + "str s24, [x6, #0x0]\n" "19:" // Height 1: Partial direct writeback: Done "b 21f\n" "20:" // Height 1: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" "21:" // Height 1: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 2b\n" "b 170f\n" "22:" // Height 2 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "23:" // Height 2: Column loop - "cbz x15, 24f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 24f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" + "add x3, x3, #0x10\n" "b 29f\n" "24:" // Height 2: no bias "tbz %x[flags], #0, 28f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x4, 
#0x4\n" + "add x13, x6, x26, LSL #2\n" "bge 27f\n" - "tbz x17, #1, 25f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "tbz x17, #0, 26f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" + "tbz x4, #1, 25f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "tbz x4, #0, 26f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" "b 26f\n" "25:" // Height 2: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" "26:" // Height 2: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 29f\n" "27:" // Height 2: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" "b 29f\n" "28:" // Height 2: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "29:" // Height 2: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "30:" // Height 2: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "cbnz x13, 32f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "cbnz x7, 32f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" "b 32f\n" "31:" // Height 2: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" "32:" // Height 2: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 35f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 34f\n" "33:" // Height 2: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" - "ldr d10, [x16, #0x20]\n" - "add x11, x11, #0x10\n" - "ldr x21, [x16, #0x28]\n" - "add x9, x9, #0x10\n" - "mov v9.d[1], x8\n" - "ldr d11, [x16, #0x30]\n" - "ldr x8, [x16, #0x38]\n" - "sub x12, x12, #0x4\n" + "add x16, x16, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" - "mov v11.d[1], x8\n" - "prfm pldl1keep, [x9, #0x80]\n" - "ldr x10, [x11, #0x8]\n" - "cmp x12, #0x8\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "ldr x28, [x9, #0x8]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "add x16, x16, #0x40\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" + "ldr d10, [x5, #0x20]\n" + "ldr x27, [x5, #0x8]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" - "mov 
v8.d[1], x26\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" + "ldr d1, [x16, #0x0]\n" + "sub x8, x8, #0x4\n" + "ldr d11, [x5, #0x30]\n" + "cmp x8, #0x8\n" + "ldr x26, [x5, #0x18]\n" + "mov v8.d[1], x27\n" + "ldr x27, [x17, #0x8]\n" + "mov v9.d[1], x26\n" + "ldr x26, [x16, #0x8]\n" + "mov v0.d[1], x27\n" + "ldr x27, [x5, #0x28]\n" + "mov v1.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v10.d[1], x27\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x17, #0x80]\n" + "prfm pldl1keep, [x16, #0x80]\n" "bge 33b\n" "34:" // Height 2: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" - "ldr q11, [x16, #0x30]\n" - "sub x12, x12, #0x4\n" - "add x11, x11, #0x10\n" - "add x9, x9, #0x10\n" + "add x16, x16, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "add x16, x16, #0x40\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" + "add x5, x5, #0x40\n" "fmla v24.4s, v11.4s, v0.s[3]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" "35:" // Height 2: Multiply loop: Main loop skip - "cbz x12, 37f\n" + "cbz x8, 37f\n" "36:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "cbnz x12, 36b\n" + "ldr s18, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s17, [x16], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v18.s[0]\n" + "fmla v25.4s, v16.4s, v17.s[0]\n" + "add x5, x5, #0x10\n" + "cbnz x8, 36b\n" "37:" // Height 2: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 30b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" "tbz %x[flags], #1, 38f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" "38:" // Height 2: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 41f\n" - "tbz x17, #1, 39f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "tbz x17, #0, 40f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" + "tbz x4, #1, 39f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "tbz x4, #0, 40f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" "b 40f\n" "39:" // Height 2: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" "40:" // Height 2: Partial direct writeback: Done "b 42f\n" "41:" // Height 
2: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" "42:" // Height 2: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 23b\n" "b 170f\n" "43:" // Height 3 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "44:" // Height 3: Column loop - "cbz x15, 45f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 45f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "b 50f\n" "45:" // Height 3: no bias "tbz %x[flags], #0, 49f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x12, x13, x26, LSL #2\n" "bge 48f\n" - "tbz x17, #1, 46f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "tbz x17, #0, 47f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" + "tbz x4, #1, 46f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "tbz x4, #0, 47f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" "b 47f\n" "46:" // Height 3: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" "47:" // Height 3: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 50f\n" "48:" // Height 3: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" "b 50f\n" "49:" // Height 3: no accumulate "movi v24.16b, #0x0\n" "movi v25.16b, #0x0\n" "movi v26.16b, #0x0\n" "50:" // Height 3: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "51:" // Height 3: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 52f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "cbnz x13, 53f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "cbnz x7, 53f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" "b 53f\n" "52:" // Height 3: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" + "mov x17, 
%x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" "53:" // Height 3: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 56f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 55f\n" "54:" // Height 3: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" - "ldr x21, [x16, #0x28]\n" - "add x11, x11, #0x10\n" - "mov v9.d[1], x8\n" - "ldr d11, [x16, #0x30]\n" - "ldr x8, [x16, #0x38]\n" - "add x9, x9, #0x10\n" + "add x15, x15, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "mov v11.d[1], x8\n" - "prfm pldl1keep, [x9, #0x80]\n" - "add x27, x27, #0x10\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "ldr x28, [x5, #0x8]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "ldr x10, [x11, #0x8]\n" + "ldr x27, [x5, #0x18]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr x28, [x9, #0x8]\n" - "ldr x26, [x27, #0x8]\n" - "sub x12, x12, #0x4\n" + "ldr d10, [x5, #0x20]\n" + "ldr x26, [x5, #0x28]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x16, #0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" - "mov v0.d[1], x10\n" - "cmp x12, #0x8\n" + "ldr d2, [x15, #0x0]\n" + "sub x8, x8, #0x4\n" + "ldr d11, [x5, #0x30]\n" + "cmp x8, #0x8\n" + "ldr x9, [x17, #0x8]\n" + "mov v8.d[1], x28\n" + "ldr x28, [x16, #0x8]\n" + "mov v9.d[1], x27\n" + "ldr x27, [x15, #0x8]\n" + "mov v10.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v0.d[1], x9\n" "mov v1.d[1], x28\n" - "add x16, x16, #0x40\n" - "mov v2.d[1], x26\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" - "mov v8.d[1], x26\n" + "prfm pldl1keep, [x17, #0x80]\n" + "mov v2.d[1], x27\n" + "prfm pldl1keep, [x16, #0x80]\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x15, #0x80]\n" "bge 54b\n" "55:" // Height 3: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" - "sub x12, x12, #0x4\n" - "add x11, x11, #0x10\n" + "add x15, x15, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x9, x9, #0x10\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "add x27, x27, #0x10\n" - "add x16, x16, #0x40\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" "56:" // Height 3: Multiply loop: Main loop skip - "cbz x12, 
58f\n" + "cbz x8, 58f\n" "57:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "cbnz x12, 57b\n" + "ldr s19, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s18, [x16], #0x4\n" + "ldr s17, [x15], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v19.s[0]\n" + "fmla v25.4s, v16.4s, v18.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 57b\n" "58:" // Height 3: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 51b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" "tbz %x[flags], #1, 59f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" "59:" // Height 3: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 62f\n" - "tbz x17, #1, 60f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "tbz x17, #0, 61f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" + "tbz x4, #1, 60f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "tbz x4, #0, 61f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" "b 61f\n" "60:" // Height 3: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" - "str s26, [x26, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" "61:" // Height 3: Partial direct writeback: Done "b 63f\n" "62:" // Height 3: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" "63:" // Height 3: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 44b\n" "b 170f\n" "64:" // Height 4 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "65:" // Height 4: Column loop - "cbz x15, 66f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 66f\n" + "ldr q24, [x3, 
#0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "mov v27.16b, v24.16b\n" "b 71f\n" "66:" // Height 4: no bias "tbz %x[flags], #0, 70f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" - "add x25, x26, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x11, x12, x26, LSL #2\n" "bge 69f\n" - "tbz x17, #1, 67f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "tbz x17, #0, 68f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" - "ld1 { v27.s }[2], [x25]\n" + "tbz x4, #1, 67f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "ldr d27, [x11], #0x8\n" + "tbz x4, #0, 68f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" + "ld1 { v27.s }[2], [x11]\n" "b 68f\n" "67:" // Height 4: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" - "ldr s27, [x25, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" + "ldr s27, [x11, #0x0]\n" "68:" // Height 4: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 71f\n" "69:" // Height 4: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" - "ldr q27, [x25, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q27, [x11, #0x0]\n" "b 71f\n" "70:" // Height 4: no accumulate "movi v24.16b, #0x0\n" @@ -669,248 +668,248 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "movi v26.16b, #0x0\n" "movi v27.16b, #0x0\n" "71:" // Height 4: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "72:" // Height 4: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "ldr x25, [x20, #0x18]\n" - "cbnz x13, 74f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" - "add x25, x25, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "ldr x14, [x26, #0x18]\n" + "cbnz x7, 74f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" + "add x14, x14, x26, LSL #2\n" "b 74f\n" "73:" // Height 4: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" - "add x25, x27, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" + "add x14, x15, x27, LSL #2\n" "74:" // Height 4: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 77f\n" - "ldr q0, [x11, #0x0]\n" 
- "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q3, [x14, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 76f\n" "75:" // Height 4: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "ldr x21, [x16, #0x28]\n" - "mov v9.d[1], x8\n" - "ldr d11, [x16, #0x30]\n" - "ldr x8, [x16, #0x38]\n" - "add x11, x11, #0x10\n" + "add x14, x14, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "mov v11.d[1], x8\n" + "ldr x27, [x5, #0x8]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "ldr x10, [x11, #0x8]\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x9, x9, #0x10\n" + "ldr x26, [x5, #0x18]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "ldr x11, [x5, #0x28]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x10, [x17, #0x8]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "add x27, x27, #0x10\n" + "ldr d10, [x5, #0x20]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d1, [x16, #0x0]\n" + "ldr x9, [x16, #0x8]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d2, [x15, #0x0]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d3, [x14, #0x0]\n" + "sub x8, x8, #0x4\n" + "ldr d11, [x5, #0x30]\n" + "cmp x8, #0x8\n" + "ldr x28, [x15, #0x8]\n" + "mov v8.d[1], x27\n" + "ldr x27, [x14, #0x8]\n" + "mov v9.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v10.d[1], x11\n" + "prfm pldl1keep, [x17, #0x80]\n" "mov v0.d[1], x10\n" - "ldr x26, [x27, #0x8]\n" - "mov v1.d[1], x28\n" - "add x25, x25, #0x10\n" - "prfm pldl1keep, [x25, #0x80]\n" - "sub x12, x12, #0x4\n" - "mov v2.d[1], x26\n" - "ldr d3, [x25, #0x0]\n" - "ldr x8, [x25, #0x8]\n" - "cmp x12, #0x8\n" - "add x16, x16, #0x40\n" - "ldr d8, [x16, #0x0]\n" - "mov v3.d[1], x8\n" - "ldr x26, [x16, #0x8]\n" - "mov v8.d[1], x26\n" + "prfm pldl1keep, [x16, #0x80]\n" + "mov v1.d[1], x9\n" + "mov v2.d[1], x28\n" + "prfm pldl1keep, [x15, #0x80]\n" + "mov v3.d[1], x27\n" + "prfm pldl1keep, [x14, #0x80]\n" + "mov v11.d[1], x26\n" "bge 75b\n" "76:" // Height 4: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "sub x12, x12, #0x4\n" + "add x14, x14, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "add x11, x11, #0x10\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "add x9, x9, #0x10\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v24.4s, v10.4s, 
v0.s[2]\n" - "add x27, x27, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x25, x25, #0x10\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "add x16, x16, #0x40\n" "fmla v25.4s, v11.4s, v1.s[3]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" "77:" // Height 4: Multiply loop: Main loop skip - "cbz x12, 79f\n" + "cbz x8, 79f\n" "78:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "cbnz x12, 78b\n" + "ldr s20, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s19, [x16], #0x4\n" + "ldr s18, [x15], #0x4\n" + "ldr s17, [x14], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v20.s[0]\n" + "fmla v25.4s, v16.4s, v19.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v18.s[0]\n" + "fmla v27.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 78b\n" "79:" // Height 4: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 72b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x8, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" "tbz %x[flags], #1, 80f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" "fmin v27.4s, v27.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" + "fmax v27.4s, v27.4s, v16.4s\n" "80:" // Height 4: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 83f\n" - "tbz x17, #1, 81f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "str d27, [x25], #0x8\n" - "tbz x17, #0, 82f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" - "st1 { v27.s }[2], [x25]\n" + "tbz x4, #1, 81f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "str d27, [x11], #0x8\n" + "tbz x4, #0, 82f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" + "st1 { v27.s }[2], [x11]\n" "b 82f\n" "81:" // Height 4: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, 
#0x0]\n" - "str s26, [x26, #0x0]\n" - "str s27, [x25, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" + "str s27, [x11, #0x0]\n" "82:" // Height 4: Partial direct writeback: Done "b 84f\n" "83:" // Height 4: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" - "str q27, [x25, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" + "str q27, [x11, #0x0]\n" "84:" // Height 4: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 65b\n" "b 170f\n" "85:" // Height 5 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "86:" // Height 5: Column loop - "cbz x15, 87f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 87f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "b 92f\n" "87:" // Height 5: no bias "tbz %x[flags], #0, 91f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" - "add x25, x26, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x10, x11, x26, LSL #2\n" "bge 90f\n" - "tbz x17, #1, 88f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d28, [x24], #0x8\n" - "tbz x17, #0, 89f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" - "ld1 { v27.s }[2], [x25]\n" - "ld1 { v28.s }[2], [x24]\n" + "tbz x4, #1, 88f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "ldr d27, [x11], #0x8\n" + "ldr d28, [x10], #0x8\n" + "tbz x4, #0, 89f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" + "ld1 { v27.s }[2], [x11]\n" + "ld1 { v28.s }[2], [x10]\n" "b 89f\n" "88:" // Height 5: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" - "ldr s27, [x25, #0x0]\n" - "ldr s28, [x24, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" + "ldr s27, [x11, #0x0]\n" + "ldr s28, [x10, #0x0]\n" "89:" // Height 5: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 92f\n" "90:" // Height 5: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" - "ldr q27, [x25, #0x0]\n" - "ldr q28, [x24, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q27, [x11, #0x0]\n" + "ldr q28, [x10, #0x0]\n" "b 92f\n" "91:" // Height 5: no accumulate "movi v24.16b, #0x0\n" @@ -919,283 +918,283 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "movi v27.16b, #0x0\n" "movi v28.16b, #0x0\n" "92:" // Height 5: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "93:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - 
"ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 94f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "ldr x25, [x20, #0x18]\n" - "ldr x24, [x20, #0x20]\n" - "cbnz x13, 95f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" - "add x25, x25, x8, LSL #2\n" - "add x24, x24, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "ldr x14, [x26, #0x18]\n" + "ldr x13, [x26, #0x20]\n" + "cbnz x7, 95f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" + "add x14, x14, x26, LSL #2\n" + "add x13, x13, x26, LSL #2\n" "b 95f\n" "94:" // Height 5: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" - "add x25, x27, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" + "add x14, x15, x27, LSL #2\n" + "add x13, x14, x27, LSL #2\n" "95:" // Height 5: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 98f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x24, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q3, [x14, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 97f\n" "96:" // Height 5: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "ldr x21, [x16, #0x28]\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "mov v9.d[1], x8\n" - "ldr d11, [x16, #0x30]\n" - "add x11, x11, #0x10\n" + "add x13, x13, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "ldr x8, [x16, #0x38]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "ldr x27, [x5, #0x8]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "ldr x10, [x11, #0x8]\n" + "ldr x26, [x5, #0x18]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "mov v11.d[1], x8\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x9, x9, #0x10\n" + "ldr x12, [x5, #0x28]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "ldr x11, [x17, #0x8]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x10, [x16, #0x8]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "add x27, x27, #0x10\n" + "ldr x9, [x15, #0x8]\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "ldr d10, [x5, #0x20]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x16, 
#0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x15, #0x0]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "mov v0.d[1], x10\n" + "ldr d3, [x14, #0x0]\n" + "ldr x28, [x14, #0x8]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" - "mov v1.d[1], x28\n" - "ldr x26, [x27, #0x8]\n" - "add x25, x25, #0x10\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x12, x12, #0x4\n" - "mov v2.d[1], x26\n" - "ldr d3, [x25, #0x0]\n" - "ldr x8, [x25, #0x8]\n" - "cmp x12, #0x8\n" - "ldr d4, [x24, #0x0]\n" - "add x16, x16, #0x40\n" - "ldr x21, [x24, #0x8]\n" - "mov v3.d[1], x8\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" - "mov v4.d[1], x21\n" - "mov v8.d[1], x26\n" + "ldr d4, [x13, #0x0]\n" + "sub x8, x8, #0x4\n" + "ldr d11, [x5, #0x30]\n" + "cmp x8, #0x8\n" + "mov v8.d[1], x27\n" + "ldr x27, [x13, #0x8]\n" + "mov v9.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "prfm pldl1keep, [x17, #0x80]\n" + "mov v10.d[1], x12\n" + "prfm pldl1keep, [x16, #0x80]\n" + "mov v0.d[1], x11\n" + "prfm pldl1keep, [x15, #0x80]\n" + "mov v1.d[1], x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "mov v2.d[1], x9\n" + "mov v3.d[1], x28\n" + "prfm pldl1keep, [x13, #0x80]\n" + "mov v4.d[1], x27\n" + "mov v11.d[1], x26\n" "bge 96b\n" "97:" // Height 5: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "sub x12, x12, #0x4\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x11, x11, #0x10\n" + "add x13, x13, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x9, x9, #0x10\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "add x27, x27, #0x10\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x25, x25, #0x10\n" + "prfm pldl1keep, [x13, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x24, x24, #0x10\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x24, #0x80]\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "add x16, x16, #0x40\n" "fmla v24.4s, v11.4s, v0.s[3]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" "98:" // Height 5: Multiply loop: Main loop skip - "cbz x12, 100f\n" + "cbz x8, 100f\n" "99:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "cbnz x12, 99b\n" + "ldr s21, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s20, [x16], #0x4\n" + "ldr s19, [x15], #0x4\n" + "ldr s18, [x14], #0x4\n" + "ldr s17, [x13], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v21.s[0]\n" + "fmla v25.4s, v16.4s, 
v20.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v19.s[0]\n" + "fmla v27.4s, v16.4s, v18.s[0]\n" + "fmla v28.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 99b\n" "100:" // Height 5: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 93b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x8, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x8, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x10, #0x0]\n" "tbz %x[flags], #1, 101f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" "fmin v27.4s, v27.4s, v16.4s\n" "fmin v28.4s, v28.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" + "fmax v27.4s, v27.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v16.4s\n" "101:" // Height 5: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 104f\n" - "tbz x17, #1, 102f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "str d27, [x25], #0x8\n" - "str d28, [x24], #0x8\n" - "tbz x17, #0, 103f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" - "st1 { v27.s }[2], [x25]\n" - "st1 { v28.s }[2], [x24]\n" + "tbz x4, #1, 102f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "str d27, [x11], #0x8\n" + "str d28, [x10], #0x8\n" + "tbz x4, #0, 103f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" + "st1 { v27.s }[2], [x11]\n" + "st1 { v28.s }[2], [x10]\n" "b 103f\n" "102:" // Height 5: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" - "str s26, [x26, #0x0]\n" - "str s27, [x25, #0x0]\n" - "str s28, [x24, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" + "str s27, [x11, #0x0]\n" + "str s28, [x10, #0x0]\n" "103:" // Height 5: Partial direct writeback: Done "b 105f\n" "104:" // Height 5: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" - "str q27, [x25, #0x0]\n" - "str q28, [x24, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" + "str q27, [x11, #0x0]\n" + "str q28, [x10, #0x0]\n" "105:" // Height 5: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 86b\n" "b 170f\n" "106:" 
// Height 6 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "107:" // Height 6: Column loop - "cbz x15, 108f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 108f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" "b 113f\n" "108:" // Height 6: no bias "tbz %x[flags], #0, 112f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" - "add x25, x26, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x9, x10, x26, LSL #2\n" "bge 111f\n" - "tbz x17, #1, 109f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d28, [x24], #0x8\n" - "ldr d29, [x23], #0x8\n" - "tbz x17, #0, 110f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" - "ld1 { v27.s }[2], [x25]\n" - "ld1 { v28.s }[2], [x24]\n" - "ld1 { v29.s }[2], [x23]\n" + "tbz x4, #1, 109f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "ldr d27, [x11], #0x8\n" + "ldr d28, [x10], #0x8\n" + "ldr d29, [x9], #0x8\n" + "tbz x4, #0, 110f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" + "ld1 { v27.s }[2], [x11]\n" + "ld1 { v28.s }[2], [x10]\n" + "ld1 { v29.s }[2], [x9]\n" "b 110f\n" "109:" // Height 6: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" - "ldr s27, [x25, #0x0]\n" - "ldr s28, [x24, #0x0]\n" - "ldr s29, [x23, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" + "ldr s27, [x11, #0x0]\n" + "ldr s28, [x10, #0x0]\n" + "ldr s29, [x9, #0x0]\n" "110:" // Height 6: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 113f\n" "111:" // Height 6: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" - "ldr q27, [x25, #0x0]\n" - "ldr q28, [x24, #0x0]\n" - "ldr q29, [x23, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q27, [x11, #0x0]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q29, [x9, #0x0]\n" "b 113f\n" "112:" // Height 6: no accumulate "movi v24.16b, #0x0\n" @@ -1205,154 +1204,154 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "movi v28.16b, #0x0\n" "movi v29.16b, #0x0\n" "113:" // Height 6: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "114:" // Height 6: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 115f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - 
"ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "ldr x25, [x20, #0x18]\n" - "ldr x24, [x20, #0x20]\n" - "ldr x23, [x20, #0x28]\n" - "cbnz x13, 116f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" - "add x25, x25, x8, LSL #2\n" - "add x24, x24, x8, LSL #2\n" - "add x23, x23, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "ldr x14, [x26, #0x18]\n" + "ldr x13, [x26, #0x20]\n" + "ldr x12, [x26, #0x28]\n" + "cbnz x7, 116f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" + "add x14, x14, x26, LSL #2\n" + "add x13, x13, x26, LSL #2\n" + "add x12, x12, x26, LSL #2\n" "b 116f\n" "115:" // Height 6: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" - "add x25, x27, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" + "add x14, x15, x27, LSL #2\n" + "add x13, x14, x27, LSL #2\n" + "add x12, x13, x27, LSL #2\n" "116:" // Height 6: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 119f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x24, #0x0]\n" - "ldr q5, [x23, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q3, [x14, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q5, [x12, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 118f\n" "117:" // Height 6: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "ldr x21, [x16, #0x28]\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "mov v9.d[1], x8\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "ldr d11, [x16, #0x30]\n" + "add x12, x12, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "mov v10.d[1], x21\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "ldr x8, [x16, #0x38]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "add x11, x11, #0x10\n" + "ldr x9, [x5, #0x8]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "ldr x28, [x5, #0x18]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "mov v11.d[1], x8\n" + "ldr x27, [x5, #0x28]\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "ldr x10, [x11, #0x8]\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x9, x9, #0x10\n" + "ldr x26, [x17, #0x8]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "ldr x11, [x16, #0x8]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x10, [x15, #0x8]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "add x27, x27, #0x10\n" + "sub x8, x8, #0x4\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "cmp x8, #0x8\n" "fmla v29.4s, v10.4s, v5.s[2]\n" - "ldr x26, [x27, #0x8]\n" + "ldr d10, [x5, #0x20]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, 
[x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x16, #0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x15, #0x0]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "mov v0.d[1], x10\n" + "ldr d3, [x14, #0x0]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" - "mov v1.d[1], x28\n" + "ldr d4, [x13, #0x0]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" - "mov v2.d[1], x26\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x23, x23, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x12, x12, #0x4\n" - "prfm pldl1keep, [x23, #0x80]\n" - "cmp x12, #0x8\n" - "ldr d3, [x25, #0x0]\n" - "add x16, x16, #0x40\n" - "ldr x8, [x25, #0x8]\n" - "ldr d4, [x24, #0x0]\n" - "ldr x21, [x24, #0x8]\n" - "mov v3.d[1], x8\n" - "ldr d5, [x23, #0x0]\n" - "ldr x8, [x23, #0x8]\n" - "mov v4.d[1], x21\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" - "mov v5.d[1], x8\n" - "mov v8.d[1], x26\n" + "ldr d5, [x12, #0x0]\n" + "ldr d11, [x5, #0x30]\n" + "mov v8.d[1], x9\n" + "ldr x9, [x14, #0x8]\n" + "mov v9.d[1], x28\n" + "ldr x28, [x13, #0x8]\n" + "mov v10.d[1], x27\n" + "ldr x27, [x12, #0x8]\n" + "mov v0.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v1.d[1], x11\n" + "prfm pldl1keep, [x17, #0x80]\n" + "mov v2.d[1], x10\n" + "prfm pldl1keep, [x16, #0x80]\n" + "mov v3.d[1], x9\n" + "prfm pldl1keep, [x15, #0x80]\n" + "mov v4.d[1], x28\n" + "prfm pldl1keep, [x14, #0x80]\n" + "mov v5.d[1], x27\n" + "prfm pldl1keep, [x13, #0x80]\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x12, #0x80]\n" "bge 117b\n" "118:" // Height 6: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "sub x12, x12, #0x4\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x11, x11, #0x10\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "add x12, x12, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "add x9, x9, #0x10\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "add x27, x27, #0x10\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "add x25, x25, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x13, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x23, x23, #0x10\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "add x16, x16, #0x40\n" "fmla v29.4s, v10.4s, v5.s[2]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" @@ -1361,108 +1360,108 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "fmla v28.4s, v11.4s, v4.s[3]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" "119:" // Height 6: Multiply loop: Main loop skip - "cbz x12, 121f\n" + "cbz x8, 121f\n" "120:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" 
- "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" - "cbnz x12, 120b\n" + "ldr s22, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s21, [x16], #0x4\n" + "ldr s20, [x15], #0x4\n" + "ldr s19, [x14], #0x4\n" + "ldr s18, [x13], #0x4\n" + "ldr s17, [x12], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v22.s[0]\n" + "fmla v25.4s, v16.4s, v21.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v20.s[0]\n" + "fmla v27.4s, v16.4s, v19.s[0]\n" + "fmla v28.4s, v16.4s, v18.s[0]\n" + "fmla v29.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 120b\n" "121:" // Height 6: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 114b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x8, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x8, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x8, LSL #2\n" - "prfm pstl1keep, [x23, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "add x9, x10, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" "tbz %x[flags], #1, 122f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" "fmin v27.4s, v27.4s, v16.4s\n" "fmin v28.4s, v28.4s, v16.4s\n" "fmin v29.4s, v29.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" + "fmax v27.4s, v27.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v16.4s\n" + "fmax v29.4s, v29.4s, v16.4s\n" "122:" // Height 6: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 125f\n" - "tbz x17, #1, 123f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "str d27, [x25], #0x8\n" - "str d28, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "tbz x17, #0, 124f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" - "st1 { v27.s }[2], [x25]\n" - "st1 { v28.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" + "tbz x4, #1, 123f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "str d27, [x11], #0x8\n" + "str d28, [x10], #0x8\n" + "str d29, [x9], #0x8\n" + "tbz x4, #0, 124f\n" + "st1 { 
v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" + "st1 { v27.s }[2], [x11]\n" + "st1 { v28.s }[2], [x10]\n" + "st1 { v29.s }[2], [x9]\n" "b 124f\n" "123:" // Height 6: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" - "str s26, [x26, #0x0]\n" - "str s27, [x25, #0x0]\n" - "str s28, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" + "str s27, [x11, #0x0]\n" + "str s28, [x10, #0x0]\n" + "str s29, [x9, #0x0]\n" "124:" // Height 6: Partial direct writeback: Done "b 126f\n" "125:" // Height 6: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" - "str q27, [x25, #0x0]\n" - "str q28, [x24, #0x0]\n" - "str q29, [x23, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" + "str q27, [x11, #0x0]\n" + "str q28, [x10, #0x0]\n" + "str q29, [x9, #0x0]\n" "126:" // Height 6: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 107b\n" "b 170f\n" "127:" // Height 7 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" "128:" // Height 7: Column loop - "cbz x15, 129f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 129f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" @@ -1470,53 +1469,53 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "b 134f\n" "129:" // Height 7: no bias "tbz %x[flags], #0, 133f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" - "add x25, x26, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" - "add x22, x23, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "add x9, x10, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x28, x9, x26, LSL #2\n" "bge 132f\n" - "tbz x17, #1, 130f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d28, [x24], #0x8\n" - "ldr d29, [x23], #0x8\n" - "ldr d30, [x22], #0x8\n" - "tbz x17, #0, 131f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" - "ld1 { v27.s }[2], [x25]\n" - "ld1 { v28.s }[2], [x24]\n" - "ld1 { v29.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x22]\n" + "tbz x4, #1, 130f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "ldr d27, [x11], #0x8\n" + "ldr d28, [x10], #0x8\n" + "ldr d29, [x9], #0x8\n" + "ldr d30, [x28], #0x8\n" + "tbz x4, #0, 131f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" + "ld1 { v27.s }[2], [x11]\n" + "ld1 { v28.s }[2], [x10]\n" + "ld1 { v29.s }[2], [x9]\n" + "ld1 { v30.s }[2], [x28]\n" "b 131f\n" "130:" // Height 7: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" - "ldr s27, [x25, #0x0]\n" - "ldr s28, [x24, 
#0x0]\n" - "ldr s29, [x23, #0x0]\n" - "ldr s30, [x22, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" + "ldr s27, [x11, #0x0]\n" + "ldr s28, [x10, #0x0]\n" + "ldr s29, [x9, #0x0]\n" + "ldr s30, [x28, #0x0]\n" "131:" // Height 7: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 134f\n" "132:" // Height 7: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" - "ldr q27, [x25, #0x0]\n" - "ldr q28, [x24, #0x0]\n" - "ldr q29, [x23, #0x0]\n" - "ldr q30, [x22, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q27, [x11, #0x0]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q29, [x9, #0x0]\n" + "ldr q30, [x28, #0x0]\n" "b 134f\n" "133:" // Height 7: no accumulate "movi v24.16b, #0x0\n" @@ -1527,171 +1526,171 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "movi v29.16b, #0x0\n" "movi v30.16b, #0x0\n" "134:" // Height 7: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "135:" // Height 7: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 136f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "ldr x25, [x20, #0x18]\n" - "ldr x24, [x20, #0x20]\n" - "ldr x23, [x20, #0x28]\n" - "ldr x22, [x20, #0x30]\n" - "cbnz x13, 137f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" - "add x25, x25, x8, LSL #2\n" - "add x24, x24, x8, LSL #2\n" - "add x23, x23, x8, LSL #2\n" - "add x22, x22, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "ldr x14, [x26, #0x18]\n" + "ldr x13, [x26, #0x20]\n" + "ldr x12, [x26, #0x28]\n" + "ldr x11, [x26, #0x30]\n" + "cbnz x7, 137f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" + "add x14, x14, x26, LSL #2\n" + "add x13, x13, x26, LSL #2\n" + "add x12, x12, x26, LSL #2\n" + "add x11, x11, x26, LSL #2\n" "b 137f\n" "136:" // Height 7: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" - "add x25, x27, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" - "add x22, x23, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" + "add x14, x15, x27, LSL #2\n" + "add x13, x14, x27, LSL #2\n" + "add x12, x13, x27, LSL #2\n" + "add x11, x12, x27, LSL #2\n" "137:" // Height 7: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 140f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x24, #0x0]\n" - "ldr q5, [x23, #0x0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q3, [x14, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q5, [x12, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + 
"ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 139f\n" "138:" // Height 7: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "ldr x21, [x16, #0x28]\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "mov v9.d[1], x8\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "ldr d11, [x16, #0x30]\n" + "add x12, x12, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "mov v10.d[1], x21\n" + "add x11, x11, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "ldr x8, [x16, #0x38]\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x11, x11, #0x10\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "ldr x26, [x5, #0x8]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "mov v11.d[1], x8\n" + "ldr x10, [x5, #0x18]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "ldr x10, [x11, #0x8]\n" + "ldr x9, [x5, #0x28]\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "add x9, x9, #0x10\n" + "ldr x28, [x17, #0x8]\n" "fmla v30.4s, v9.4s, v6.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x27, [x16, #0x8]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "add x27, x27, #0x10\n" + "sub x8, x8, #0x4\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "cmp x8, #0x8\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "ldr x26, [x27, #0x8]\n" + "mov v8.d[1], x26\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "add x25, x25, #0x10\n" + "ldr x26, [x15, #0x8]\n" "fmla v29.4s, v10.4s, v5.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v30.4s, v10.4s, v6.s[2]\n" - "ldr x8, [x25, #0x8]\n" + "ldr d10, [x5, #0x20]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x16, #0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x15, #0x0]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "mov v0.d[1], x10\n" + "ldr d3, [x14, #0x0]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" - "mov v1.d[1], x28\n" + "ldr d4, [x13, #0x0]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" - "mov v2.d[1], x26\n" + "ldr d5, [x12, #0x0]\n" "fmla v30.4s, v11.4s, v6.s[3]\n" - "ldr d3, [x25, #0x0]\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "add x22, x22, #0x10\n" - "mov v3.d[1], x8\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "sub x12, x12, #0x4\n" - "ldr d4, [x24, #0x0]\n" - "cmp x12, #0x8\n" - "ldr x21, [x24, #0x8]\n" - "add x16, x16, #0x40\n" - "ldr d8, [x16, #0x0]\n" - "ldr x26, [x16, #0x8]\n" - "mov v4.d[1], x21\n" - "ldr d5, [x23, #0x0]\n" - "ldr x8, [x23, #0x8]\n" - "mov v8.d[1], x26\n" - "ldr d6, [x22, #0x0]\n" - "ldr x21, [x22, #0x8]\n" - "mov v5.d[1], x8\n" - "mov v6.d[1], x21\n" + "ldr d6, [x11, #0x0]\n" + "ldr d11, [x5, #0x30]\n" + "mov v9.d[1], x10\n" + "ldr x10, [x14, #0x8]\n" + "mov v10.d[1], x9\n" + "ldr x9, [x13, #0x8]\n" + "mov v0.d[1], x28\n" + "ldr x28, [x12, #0x8]\n" + "mov v1.d[1], x27\n" + "ldr x27, [x11, #0x8]\n" + "mov v2.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v3.d[1], x10\n" + "prfm pldl1keep, [x16, #0x80]\n" + "mov v4.d[1], x9\n" + "prfm pldl1keep, [x15, #0x80]\n" + "mov v5.d[1], x28\n" + "prfm pldl1keep, [x14, 
#0x80]\n" + "mov v6.d[1], x27\n" + "prfm pldl1keep, [x13, #0x80]\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "bge 138b\n" "139:" // Height 7: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "sub x12, x12, #0x4\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x11, x11, #0x10\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "add x12, x12, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "add x27, x27, #0x10\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "add x25, x25, #0x10\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x13, #0x80]\n" "fmla v30.4s, v9.4s, v6.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x23, x23, #0x10\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x22, x22, #0x10\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x22, #0x80]\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "add x16, x16, #0x40\n" "fmla v29.4s, v10.4s, v5.s[2]\n" "fmla v30.4s, v10.4s, v6.s[2]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" @@ -1702,50 +1701,48 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "fmla v29.4s, v11.4s, v5.s[3]\n" "fmla v30.4s, v11.4s, v6.s[3]\n" "140:" // Height 7: Multiply loop: Main loop skip - "cbz x12, 142f\n" + "cbz x8, 142f\n" "141:" // Height 7: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" - "fmla v30.4s, v12.4s, v6.s[0]\n" - "cbnz x12, 141b\n" + "ldr s23, [x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s22, [x16], #0x4\n" + "ldr s21, [x15], #0x4\n" + "ldr s20, [x14], #0x4\n" + "ldr s19, [x13], #0x4\n" + "ldr s18, [x12], #0x4\n" + "ldr s17, [x11], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v23.s[0]\n" + "fmla v25.4s, v16.4s, v22.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v21.s[0]\n" + "fmla v27.4s, v16.4s, v20.s[0]\n" + "fmla v28.4s, v16.4s, v19.s[0]\n" + "fmla v29.4s, v16.4s, v18.s[0]\n" + "fmla v30.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 141b\n" "142:" // Height 7: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 
135b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" - "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x8, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x8, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x8, LSL #2\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add x22, x23, x8, LSL #2\n" - "prfm pstl1keep, [x22, #0x0]\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "add x9, x10, x26, LSL #2\n" + "add x28, x9, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x28, #0x0]\n" "tbz %x[flags], #1, 143f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" @@ -1753,70 +1750,72 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "fmin v28.4s, v28.4s, v16.4s\n" "fmin v29.4s, v29.4s, v16.4s\n" "fmin v30.4s, v30.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" - "fmax v30.4s, v30.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" + "fmax v27.4s, v27.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v16.4s\n" + "fmax v29.4s, v29.4s, v16.4s\n" + "fmax v30.4s, v30.4s, v16.4s\n" "143:" // Height 7: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 146f\n" - "tbz x17, #1, 144f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "str d27, [x25], #0x8\n" - "str d28, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "str d30, [x22], #0x8\n" - "tbz x17, #0, 145f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" - "st1 { v27.s }[2], [x25]\n" - "st1 { v28.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" - "st1 { v30.s }[2], [x22]\n" + "tbz x4, #1, 144f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "str d27, [x11], #0x8\n" + "str d28, [x10], #0x8\n" + "str d29, [x9], #0x8\n" + "str d30, [x28], #0x8\n" + "tbz x4, #0, 145f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" + "st1 { v27.s }[2], [x11]\n" + "st1 { v28.s }[2], [x10]\n" + "st1 { v29.s }[2], [x9]\n" + "st1 { v30.s }[2], [x28]\n" "b 145f\n" "144:" // Height 7: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" - "str s26, [x26, #0x0]\n" - "str s27, [x25, #0x0]\n" - "str s28, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" - "str s30, [x22, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" + "str s27, [x11, #0x0]\n" + "str s28, [x10, #0x0]\n" + "str s29, [x9, #0x0]\n" + "str s30, [x28, #0x0]\n" "145:" // Height 7: Partial direct writeback: Done "b 147f\n" "146:" // Height 7: Full writeback - "str q24, [x14, #0x0]\n" 
- "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" - "str q27, [x25, #0x0]\n" - "str q28, [x24, #0x0]\n" - "str q29, [x23, #0x0]\n" - "str q30, [x22, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" + "str q27, [x11, #0x0]\n" + "str q28, [x10, #0x0]\n" + "str q29, [x9, #0x0]\n" + "str q30, [x28, #0x0]\n" "147:" // Height 7: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 128b\n" "b 170f\n" "148:" // Height 8 - "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" - "mov x15, %x[bias]\n" - "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "mov x8, #0x20\n" - "madd %x[output_ptr], x20, x8, %x[output_ptr]\n" + "ldr x27, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov x26, #0x20\n" + "mov x3, %x[bias]\n" + "ldr x4, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x5, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x6, %x[output_ptr]\n" + "madd %x[output_ptr], x27, x26, %x[output_ptr]\n" "149:" // Height 8: Column loop - "cbz x15, 150f\n" - "ldr q24, [x15, #0x0]\n" - "add x15, x15, #0x10\n" + "cbz x3, 150f\n" + "ldr q24, [x3, #0x0]\n" "mov v25.16b, v24.16b\n" "mov v26.16b, v24.16b\n" + "add x3, x3, #0x10\n" "mov v27.16b, v24.16b\n" "mov v28.16b, v24.16b\n" "mov v29.16b, v24.16b\n" @@ -1825,58 +1824,58 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "b 155f\n" "150:" // Height 8: no bias "tbz %x[flags], #0, 154f\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x17, #0x4\n" - "add x27, x14, x8, LSL #2\n" - "add x26, x27, x8, LSL #2\n" - "add x25, x26, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" - "add x22, x23, x8, LSL #2\n" - "add x21, x22, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "add x9, x10, x26, LSL #2\n" + "add x28, x9, x26, LSL #2\n" + "cmp x4, #0x4\n" + "add x27, x28, x26, LSL #2\n" "bge 153f\n" - "tbz x17, #1, 151f\n" - "ldr d24, [x14], #0x8\n" - "ldr d25, [x27], #0x8\n" - "mov x8, #0x8\n" - "ldr d26, [x26], #0x8\n" - "ldr d27, [x25], #0x8\n" - "ldr d28, [x24], #0x8\n" - "ldr d29, [x23], #0x8\n" - "ldr d30, [x22], #0x8\n" - "ldr d31, [x21], #0x8\n" - "tbz x17, #0, 152f\n" - "ld1 { v24.s }[2], [x14]\n" - "ld1 { v25.s }[2], [x27]\n" - "ld1 { v26.s }[2], [x26]\n" - "ld1 { v27.s }[2], [x25]\n" - "ld1 { v28.s }[2], [x24]\n" - "ld1 { v29.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x22]\n" - "ld1 { v31.s }[2], [x21]\n" + "tbz x4, #1, 151f\n" + "ldr d24, [x6], #0x8\n" + "mov x26, #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x12], #0x8\n" + "ldr d27, [x11], #0x8\n" + "ldr d28, [x10], #0x8\n" + "ldr d29, [x9], #0x8\n" + "ldr d30, [x28], #0x8\n" + "ldr d31, [x27], #0x8\n" + "tbz x4, #0, 152f\n" + "ld1 { v24.s }[2], [x6]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x12]\n" + "ld1 { v27.s }[2], [x11]\n" + "ld1 { v28.s }[2], [x10]\n" + "ld1 { v29.s }[2], [x9]\n" + "ld1 { v30.s }[2], [x28]\n" + "ld1 { v31.s }[2], [x27]\n" "b 152f\n" "151:" // Height 8: Partial accumulate: partial_1_0 - "ldr s24, [x14, #0x0]\n" - "mov x8, #0x0\n" - "ldr s25, [x27, #0x0]\n" - "ldr s26, [x26, #0x0]\n" - "ldr s27, [x25, #0x0]\n" - "ldr s28, [x24, #0x0]\n" - "ldr s29, [x23, #0x0]\n" - "ldr s30, [x22, #0x0]\n" - "ldr s31, [x21, #0x0]\n" + "ldr s24, [x6, #0x0]\n" + "mov x26, #0x0\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x12, #0x0]\n" + "ldr 
s27, [x11, #0x0]\n" + "ldr s28, [x10, #0x0]\n" + "ldr s29, [x9, #0x0]\n" + "ldr s30, [x28, #0x0]\n" + "ldr s31, [x27, #0x0]\n" "152:" // Height 8: Partial accumulate: Done - "sub x14, x14, x8\n" + "sub x6, x6, x26\n" "b 155f\n" "153:" // Height 8: full accumulate - "ldr q24, [x14, #0x0]\n" - "ldr q25, [x27, #0x0]\n" - "ldr q26, [x26, #0x0]\n" - "ldr q27, [x25, #0x0]\n" - "ldr q28, [x24, #0x0]\n" - "ldr q29, [x23, #0x0]\n" - "ldr q30, [x22, #0x0]\n" - "ldr q31, [x21, #0x0]\n" + "ldr q24, [x6, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x12, #0x0]\n" + "ldr q27, [x11, #0x0]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q29, [x9, #0x0]\n" + "ldr q30, [x28, #0x0]\n" + "ldr q31, [x27, #0x0]\n" "b 155f\n" "154:" // Height 8: no accumulate "movi v24.16b, #0x0\n" @@ -1888,188 +1887,188 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "movi v30.16b, #0x0\n" "movi v31.16b, #0x0\n" "155:" // Height 8: setup done - "mov x13, #0x0\n" + "mov x7, #0x0\n" "156:" // Height 8: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w12, [x20, x13, LSL #0x2]\n" + "ldr x26, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w8, [x26, x7, LSL #0x2]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 157f\n" - "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" - "add x20, x20, x8, LSL #3\n" - "ldr x11, [x20, #0x0]\n" - "ldr x9, [x20, #0x8]\n" - "ldr x27, [x20, #0x10]\n" - "ldr x25, [x20, #0x18]\n" - "ldr x24, [x20, #0x20]\n" - "ldr x23, [x20, #0x28]\n" - "ldr x22, [x20, #0x30]\n" - "ldr x20, [x20, #0x38]\n" - "cbnz x13, 158f\n" - "ldr x8, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x11, x11, x8, LSL #2\n" - "add x9, x9, x8, LSL #2\n" - "add x27, x27, x8, LSL #2\n" - "add x25, x25, x8, LSL #2\n" - "add x24, x24, x8, LSL #2\n" - "add x23, x23, x8, LSL #2\n" - "add x22, x22, x8, LSL #2\n" - "add x20, x20, x8, LSL #2\n" + "ldr x26, [%x[input_ptr], x7, LSL #0x3]\n" + "add x26, x26, x27, LSL #3\n" + "ldr x17, [x26, #0x0]\n" + "ldr x16, [x26, #0x8]\n" + "ldr x15, [x26, #0x10]\n" + "ldr x14, [x26, #0x18]\n" + "ldr x13, [x26, #0x20]\n" + "ldr x12, [x26, #0x28]\n" + "ldr x11, [x26, #0x30]\n" + "ldr x27, [x26, #0x38]\n" + "cbnz x7, 158f\n" + "ldr x26, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x17, x17, x26, LSL #2\n" + "add x16, x16, x26, LSL #2\n" + "add x15, x15, x26, LSL #2\n" + "add x14, x14, x26, LSL #2\n" + "add x13, x13, x26, LSL #2\n" + "add x12, x12, x26, LSL #2\n" + "add x11, x11, x26, LSL #2\n" + "add x27, x27, x26, LSL #2\n" "b 158f\n" "157:" // Height 8: setup direct input - "mov x11, %x[input_ptr]\n" - "add x9, x11, x8, LSL #2\n" - "add x27, x9, x8, LSL #2\n" - "add x25, x27, x8, LSL #2\n" - "add x24, x25, x8, LSL #2\n" - "add x23, x24, x8, LSL #2\n" - "add x22, x23, x8, LSL #2\n" - "add x20, x22, x8, LSL #2\n" + "mov x17, %x[input_ptr]\n" + "add x16, x17, x27, LSL #2\n" + "add x15, x16, x27, LSL #2\n" + "add x14, x15, x27, LSL #2\n" + "add x13, x14, x27, LSL #2\n" + "add x12, x13, x27, LSL #2\n" + "add x11, x12, x27, LSL #2\n" + "add x27, x11, x27, LSL #2\n" "158:" // Height 8: input setup done - "cmp x12, #0x4\n" + "cmp x8, #0x4\n" "blt 161f\n" - "ldr q0, [x11, #0x0]\n" - "ldr q1, [x9, #0x0]\n" - "cmp x12, #0x8\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x24, #0x0]\n" - "ldr q5, [x23, #0x0]\n" - "ldr q6, [x22, #0x0]\n" - "ldr q7, [x20, #0x0]\n" - "ldr q8, [x16, #0x0]\n" + "ldr q0, [x17, #0x0]\n" + "cmp x8, #0x8\n" + "ldr q1, [x16, #0x0]\n" + "ldr q2, [x15, #0x0]\n" + "ldr q3, 
[x14, #0x0]\n" + "ldr q4, [x13, #0x0]\n" + "ldr q5, [x12, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + "ldr q7, [x27, #0x0]\n" + "ldr q8, [x5, #0x0]\n" + "ldr q9, [x5, #0x10]\n" + "ldr q10, [x5, #0x20]\n" + "ldr q11, [x5, #0x30]\n" "blt 160f\n" "159:" // Height 8: Multiply loop: Main loop head "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr d9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr x8, [x16, #0x18]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr d10, [x16, #0x20]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "ldr x21, [x16, #0x28]\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "mov v9.d[1], x8\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "ldr d11, [x16, #0x30]\n" + "add x12, x12, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "mov v10.d[1], x21\n" + "add x11, x11, #0x10\n" "fmla v31.4s, v8.4s, v7.s[0]\n" - "ldr x8, [x16, #0x38]\n" + "add x27, x27, #0x10\n" "fmla v24.4s, v9.4s, v0.s[1]\n" - "add x11, x11, #0x10\n" + "add x5, x5, #0x40\n" + "ldr d8, [x5, #0x0]\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x11, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "mov v11.d[1], x8\n" + "ldr x26, [x5, #0x8]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "ldr x10, [x11, #0x8]\n" + "sub x8, x8, #0x4\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "add x9, x9, #0x10\n" + "cmp x8, #0x8\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v8.d[1], x26\n" "fmla v30.4s, v9.4s, v6.s[1]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x26, [x5, #0x18]\n" "fmla v31.4s, v9.4s, v7.s[1]\n" - "add x27, x27, #0x10\n" + "ldr d9, [x5, #0x10]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "ldr x26, [x27, #0x8]\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x25, x25, #0x10\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "mov v9.d[1], x26\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "ldr x8, [x25, #0x8]\n" + "ldr x26, [x5, #0x28]\n" "fmla v29.4s, v10.4s, v5.s[2]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v30.4s, v10.4s, v6.s[2]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x13, #0x80]\n" "fmla v31.4s, v10.4s, v7.s[2]\n" - "ldr x21, [x24, #0x8]\n" + "ldr d10, [x5, #0x20]\n" "fmla v24.4s, v11.4s, v0.s[3]\n" - "ldr d0, [x11, #0x0]\n" + "ldr d0, [x17, #0x0]\n" "fmla v25.4s, v11.4s, v1.s[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x16, #0x0]\n" "fmla v26.4s, v11.4s, v2.s[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x15, #0x0]\n" "fmla v27.4s, v11.4s, v3.s[3]\n" - "mov v0.d[1], x10\n" + "ldr d3, [x14, #0x0]\n" "fmla v28.4s, v11.4s, v4.s[3]\n" - "mov v1.d[1], x28\n" + "ldr d4, [x13, #0x0]\n" "fmla v29.4s, v11.4s, v5.s[3]\n" - "mov v2.d[1], x26\n" + "ldr d5, [x12, #0x0]\n" "fmla v30.4s, v11.4s, v6.s[3]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d6, [x11, #0x0]\n" "fmla v31.4s, v11.4s, v7.s[3]\n" - "ldr d4, [x24, #0x0]\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "mov v3.d[1], x8\n" - "prfm pldl1keep, [x23, #0x80]\n" - "mov v4.d[1], x21\n" - "prfm pldl1keep, [x22, #0x80]\n" - "ldr d5, [x23, #0x0]\n" - "add x20, x20, #0x10\n" - "prfm pldl1keep, [x20, #0x80]\n" - "sub x12, x12, #0x4\n" - "ldr x8, [x23, #0x8]\n" - "cmp x12, #0x8\n" - "ldr d6, [x22, #0x0]\n" - "add x16, x16, #0x40\n" - "ldr d8, [x16, #0x0]\n" - "mov v5.d[1], x8\n" + "ldr d7, [x27, #0x0]\n" + "ldr d11, [x5, #0x30]\n" + "mov v10.d[1], x26\n" + "ldr x26, [x17, 
#0x8]\n" + "mov v0.d[1], x26\n" "ldr x26, [x16, #0x8]\n" - "ldr x21, [x22, #0x8]\n" - "ldr d7, [x20, #0x0]\n" - "mov v8.d[1], x26\n" - "ldr x8, [x20, #0x8]\n" - "mov v6.d[1], x21\n" - "mov v7.d[1], x8\n" + "mov v1.d[1], x26\n" + "ldr x26, [x15, #0x8]\n" + "mov v2.d[1], x26\n" + "ldr x26, [x14, #0x8]\n" + "mov v3.d[1], x26\n" + "ldr x26, [x13, #0x8]\n" + "mov v4.d[1], x26\n" + "ldr x26, [x12, #0x8]\n" + "mov v5.d[1], x26\n" + "ldr x26, [x11, #0x8]\n" + "mov v6.d[1], x26\n" + "ldr x26, [x27, #0x8]\n" + "mov v7.d[1], x26\n" + "ldr x26, [x5, #0x38]\n" + "mov v11.d[1], x26\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" "bge 159b\n" "160:" // Height 8: Multiply loop: Single iteration only "fmla v24.4s, v8.4s, v0.s[0]\n" - "ldr q9, [x16, #0x10]\n" + "add x17, x17, #0x10\n" "fmla v25.4s, v8.4s, v1.s[0]\n" - "ldr q10, [x16, #0x20]\n" + "add x16, x16, #0x10\n" "fmla v26.4s, v8.4s, v2.s[0]\n" - "ldr q11, [x16, #0x30]\n" + "add x15, x15, #0x10\n" "fmla v27.4s, v8.4s, v3.s[0]\n" - "sub x12, x12, #0x4\n" + "add x14, x14, #0x10\n" "fmla v28.4s, v8.4s, v4.s[0]\n" - "add x11, x11, #0x10\n" + "add x13, x13, #0x10\n" "fmla v29.4s, v8.4s, v5.s[0]\n" - "prfm pldl1keep, [x11, #0x80]\n" + "add x12, x12, #0x10\n" "fmla v30.4s, v8.4s, v6.s[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" "fmla v31.4s, v8.4s, v7.s[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - "fmla v24.4s, v9.4s, v0.s[1]\n" "add x27, x27, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "sub x8, x8, #0x4\n" "fmla v25.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x17, #0x80]\n" "fmla v26.4s, v9.4s, v2.s[1]\n" - "add x25, x25, #0x10\n" + "prfm pldl1keep, [x16, #0x80]\n" "fmla v27.4s, v9.4s, v3.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x15, #0x80]\n" "fmla v28.4s, v9.4s, v4.s[1]\n" - "add x24, x24, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" "fmla v29.4s, v9.4s, v5.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x13, #0x80]\n" "fmla v30.4s, v9.4s, v6.s[1]\n" - "add x23, x23, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" "fmla v31.4s, v9.4s, v7.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" "fmla v24.4s, v10.4s, v0.s[2]\n" - "add x22, x22, #0x10\n" + "prfm pldl1keep, [x27, #0x80]\n" "fmla v25.4s, v10.4s, v1.s[2]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "add x5, x5, #0x40\n" "fmla v26.4s, v10.4s, v2.s[2]\n" - "add x20, x20, #0x10\n" "fmla v27.4s, v10.4s, v3.s[2]\n" - "prfm pldl1keep, [x20, #0x80]\n" "fmla v28.4s, v10.4s, v4.s[2]\n" - "add x16, x16, #0x40\n" "fmla v29.4s, v10.4s, v5.s[2]\n" "fmla v30.4s, v10.4s, v6.s[2]\n" "fmla v31.4s, v10.4s, v7.s[2]\n" @@ -2082,54 +2081,52 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "fmla v30.4s, v11.4s, v6.s[3]\n" "fmla v31.4s, v11.4s, v7.s[3]\n" "161:" // Height 8: Multiply loop: Main loop skip - "cbz x12, 163f\n" + "cbz x8, 163f\n" "162:" // Height 8: Multiply loop: Odd block loop - "ldr s0, [x11], #0x4\n" - "sub x12, x12, #0x1\n" - "ldr s1, [x9], #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr s7, [x20], #0x4\n" - "ldr q12, [x16, #0x0]\n" - "add x16, x16, #0x10\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" - "fmla v30.4s, v12.4s, v6.s[0]\n" - "fmla v31.4s, v12.4s, v7.s[0]\n" - "cbnz x12, 162b\n" + "ldr s0, 
[x17], #0x4\n" + "sub x8, x8, #0x1\n" + "ldr s23, [x16], #0x4\n" + "ldr s22, [x15], #0x4\n" + "ldr s21, [x14], #0x4\n" + "ldr s20, [x13], #0x4\n" + "ldr s19, [x12], #0x4\n" + "ldr s18, [x11], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr q16, [x5, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "fmla v25.4s, v16.4s, v23.s[0]\n" + "add x5, x5, #0x10\n" + "fmla v26.4s, v16.4s, v22.s[0]\n" + "fmla v27.4s, v16.4s, v21.s[0]\n" + "fmla v28.4s, v16.4s, v20.s[0]\n" + "fmla v29.4s, v16.4s, v19.s[0]\n" + "fmla v30.4s, v16.4s, v18.s[0]\n" + "fmla v31.4s, v16.4s, v17.s[0]\n" + "cbnz x8, 162b\n" "163:" // Height 8: Multiply loop: No odd multiplies - "ldr w8, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x13, x13, #0x1\n" - "cmp x13, x8\n" + "ldr w26, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x7, x7, #0x1\n" + "cmp x7, x26\n" "bne 156b\n" - "ldr x8, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x14, #0x0]\n" - "add x27, x14, x8, LSL #2\n" + "ldr x26, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x13, x6, x26, LSL #2\n" + "add x12, x13, x26, LSL #2\n" + "add x11, x12, x26, LSL #2\n" + "add x10, x11, x26, LSL #2\n" + "add x9, x10, x26, LSL #2\n" + "add x28, x9, x26, LSL #2\n" + "add x27, x28, x26, LSL #2\n" + "prfm pstl1keep, [x6, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x12, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x10, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x28, #0x0]\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x26, x27, x8, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x8, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x8, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x8, LSL #2\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add x22, x23, x8, LSL #2\n" - "prfm pstl1keep, [x22, #0x0]\n" - "add x21, x22, x8, LSL #2\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbz %x[flags], #1, 164f\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "add x8, %x[args_ptr], %[offset_max]\n" - "ld1r { v17.4s }, [x20]\n" - "ld1r { v16.4s }, [x8]\n" + "add x26, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x26]\n" "fmin v24.4s, v24.4s, v16.4s\n" "fmin v25.4s, v25.4s, v16.4s\n" "fmin v26.4s, v26.4s, v16.4s\n" @@ -2138,76 +2135,77 @@ void a64_hybrid_fp32_mla_8x4_a55 ( "fmin v29.4s, v29.4s, v16.4s\n" "fmin v30.4s, v30.4s, v16.4s\n" "fmin v31.4s, v31.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v17.4s\n" - "fmax v25.4s, v25.4s, v17.4s\n" - "fmax v26.4s, v26.4s, v17.4s\n" - "fmax v27.4s, v27.4s, v17.4s\n" - "fmax v28.4s, v28.4s, v17.4s\n" - "fmax v29.4s, v29.4s, v17.4s\n" - "fmax v30.4s, v30.4s, v17.4s\n" - "fmax v31.4s, v31.4s, v17.4s\n" + "add x26, %x[args_ptr], %[offset_min]\n" + "ld1r { v16.4s }, [x26]\n" + "fmax v24.4s, v24.4s, v16.4s\n" + "fmax v25.4s, v25.4s, v16.4s\n" + "fmax v26.4s, v26.4s, v16.4s\n" + "fmax v27.4s, v27.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v16.4s\n" + "fmax v29.4s, v29.4s, v16.4s\n" + "fmax v30.4s, v30.4s, v16.4s\n" + "fmax v31.4s, v31.4s, v16.4s\n" "164:" // Height 8: No activation - "cmp x17, #0x4\n" + "cmp x4, #0x4\n" "bge 167f\n" - "tbz x17, #1, 165f\n" - "str d24, [x14], #0x8\n" - "str d25, [x27], #0x8\n" - "str d26, [x26], #0x8\n" - "str d27, [x25], #0x8\n" - "str d28, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "str d30, [x22], #0x8\n" - "str d31, [x21], #0x8\n" - "tbz x17, #0, 166f\n" - "st1 { v24.s }[2], [x14]\n" - "st1 { v25.s }[2], [x27]\n" - "st1 { v26.s }[2], [x26]\n" - "st1 { v27.s }[2], [x25]\n" - "st1 { v28.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" - "st1 { v30.s 
}[2], [x22]\n" - "st1 { v31.s }[2], [x21]\n" + "tbz x4, #1, 165f\n" + "str d24, [x6], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x12], #0x8\n" + "str d27, [x11], #0x8\n" + "str d28, [x10], #0x8\n" + "str d29, [x9], #0x8\n" + "str d30, [x28], #0x8\n" + "str d31, [x27], #0x8\n" + "tbz x4, #0, 166f\n" + "st1 { v24.s }[2], [x6]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x12]\n" + "st1 { v27.s }[2], [x11]\n" + "st1 { v28.s }[2], [x10]\n" + "st1 { v29.s }[2], [x9]\n" + "st1 { v30.s }[2], [x28]\n" + "st1 { v31.s }[2], [x27]\n" "b 166f\n" "165:" // Height 8: Partial direct writeback: partial_1_0 - "str s24, [x14, #0x0]\n" - "str s25, [x27, #0x0]\n" - "str s26, [x26, #0x0]\n" - "str s27, [x25, #0x0]\n" - "str s28, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" - "str s30, [x22, #0x0]\n" - "str s31, [x21, #0x0]\n" + "str s24, [x6, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x12, #0x0]\n" + "str s27, [x11, #0x0]\n" + "str s28, [x10, #0x0]\n" + "str s29, [x9, #0x0]\n" + "str s30, [x28, #0x0]\n" + "str s31, [x27, #0x0]\n" "166:" // Height 8: Partial direct writeback: Done "b 168f\n" "167:" // Height 8: Full writeback - "str q24, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q25, [x27, #0x0]\n" - "str q26, [x26, #0x0]\n" - "str q27, [x25, #0x0]\n" - "str q28, [x24, #0x0]\n" - "str q29, [x23, #0x0]\n" - "str q30, [x22, #0x0]\n" - "str q31, [x21, #0x0]\n" + "str q24, [x6, #0x0]\n" + "add x6, x6, #0x10\n" + "str q25, [x13, #0x0]\n" + "str q26, [x12, #0x0]\n" + "str q27, [x11, #0x0]\n" + "str q28, [x10, #0x0]\n" + "str q29, [x9, #0x0]\n" + "str q30, [x28, #0x0]\n" + "str q31, [x27, #0x0]\n" "168:" // Height 8: Writeback done - "subs x17, x17, #0x4\n" + "subs x4, x4, #0x4\n" "bgt 149b\n" "subs %x[M], %x[M], #0x8\n" "beq 170f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 169f\n" - "add x20, x20, #0x8\n" - "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "add x27, x27, #0x8\n" + "str x27, [%x[args_ptr], %[offsetof_input_offset]]\n" "b 1b\n" "169:" // Update direct input - "mov x8, #0x20\n" - "madd %x[input_ptr], x8, x20, %x[input_ptr]\n" + "mov x26, #0x20\n" + "madd %x[input_ptr], x26, x27, %x[input_ptr]\n" "b 1b\n" "170:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x8", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", 
"x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp index bd22336c8d..004e5d7f23 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp @@ -92,7 +92,6 @@ void a64_hybrid_fp32_mla_8x4 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x8\n" "bge 148f\n" @@ -140,11 +139,11 @@ void a64_hybrid_fp32_mla_8x4 ( "9:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 10f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" "cbnz x10, 11f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -189,10 +188,10 @@ void a64_hybrid_fp32_mla_8x4 ( "14:" // Height 1: Multiply loop: Main loop skip "cbz x9, 16f\n" "15:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr q12, [x12, #0x0]\n" + "ldr s17, [x28], #0x4\n" + "ldr q16, [x12, #0x0]\n" "sub x9, x9, #0x1\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" + "fmla v24.4s, v16.4s, v17.s[0]\n" "add x12, x12, #0x10\n" "cbnz x9, 15b\n" "16:" // Height 1: Multiply loop: No odd multiplies @@ -271,12 +270,12 @@ void a64_hybrid_fp32_mla_8x4 ( "30:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" "cbnz x10, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -284,7 +283,7 @@ void a64_hybrid_fp32_mla_8x4 ( "b 32f\n" "31:" // Height 2: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" "32:" // Height 2: input setup done "cmp x9, #0x4\n" "blt 35f\n" @@ -337,12 +336,12 @@ void a64_hybrid_fp32_mla_8x4 ( "35:" // Height 2: Multiply loop: Main loop skip "cbz x9, 37f\n" "36:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s18, [x28], #0x4\n" + "ldr s17, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v18.s[0]\n" + "fmla v25.4s, v16.4s, v17.s[0]\n" "add x12, x12, #0x10\n" "cbnz x9, 36b\n" "37:" // Height 2: Multiply loop: No odd multiplies @@ -437,13 +436,13 @@ void a64_hybrid_fp32_mla_8x4 ( "51:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 52f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr 
x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" "cbnz x10, 53f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -452,8 +451,8 @@ void a64_hybrid_fp32_mla_8x4 ( "b 53f\n" "52:" // Height 3: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" "53:" // Height 3: input setup done "cmp x9, #0x4\n" "blt 56f\n" @@ -520,14 +519,14 @@ void a64_hybrid_fp32_mla_8x4 ( "56:" // Height 3: Multiply loop: Main loop skip "cbz x9, 58f\n" "57:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s19, [x28], #0x4\n" + "ldr s18, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr s17, [x26], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v19.s[0]\n" + "fmla v25.4s, v16.4s, v18.s[0]\n" + "fmla v26.4s, v16.4s, v17.s[0]\n" "add x12, x12, #0x10\n" "cbnz x9, 57b\n" "58:" // Height 3: Multiply loop: No odd multiplies @@ -637,14 +636,14 @@ void a64_hybrid_fp32_mla_8x4 ( "72:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" "cbnz x10, 74f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -654,9 +653,9 @@ void a64_hybrid_fp32_mla_8x4 ( "b 74f\n" "73:" // Height 4: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "74:" // Height 4: input setup done "cmp x9, #0x4\n" "blt 77f\n" @@ -737,17 +736,17 @@ void a64_hybrid_fp32_mla_8x4 ( "77:" // Height 4: Multiply loop: Main loop skip "cbz x9, 79f\n" "78:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s20, [x28], #0x4\n" + "ldr s19, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr s17, [x25], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v20.s[0]\n" + "fmla v25.4s, v16.4s, v19.s[0]\n" "add x12, x12, #0x10\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" + "fmla v26.4s, v16.4s, v18.s[0]\n" + "fmla v27.4s, v16.4s, v17.s[0]\n" "cbnz x9, 78b\n" "79:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -871,15 +870,15 @@ void a64_hybrid_fp32_mla_8x4 ( "93:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, 
[x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 94f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" "cbnz x10, 95f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -890,10 +889,10 @@ void a64_hybrid_fp32_mla_8x4 ( "b 95f\n" "94:" // Height 5: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "95:" // Height 5: input setup done "cmp x9, #0x4\n" "blt 98f\n" @@ -988,19 +987,19 @@ void a64_hybrid_fp32_mla_8x4 ( "98:" // Height 5: Multiply loop: Main loop skip "cbz x9, 100f\n" "99:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s21, [x28], #0x4\n" + "ldr s20, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" + "ldr s17, [x24], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v21.s[0]\n" + "fmla v25.4s, v16.4s, v20.s[0]\n" + "fmla v26.4s, v16.4s, v19.s[0]\n" + "fmla v27.4s, v16.4s, v18.s[0]\n" "add x12, x12, #0x10\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" + "fmla v28.4s, v16.4s, v17.s[0]\n" "cbnz x9, 99b\n" "100:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1139,16 +1138,16 @@ void a64_hybrid_fp32_mla_8x4 ( "114:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 115f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" "cbnz x10, 116f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1160,11 +1159,11 @@ void a64_hybrid_fp32_mla_8x4 ( "b 116f\n" "115:" // Height 6: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "116:" // Height 6: input 
setup done "cmp x9, #0x4\n" "blt 119f\n" @@ -1273,21 +1272,21 @@ void a64_hybrid_fp32_mla_8x4 ( "119:" // Height 6: Multiply loop: Main loop skip "cbz x9, 121f\n" "120:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s22, [x28], #0x4\n" + "ldr s21, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr s20, [x26], #0x4\n" + "ldr s19, [x25], #0x4\n" + "ldr s18, [x24], #0x4\n" + "ldr s17, [x23], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v22.s[0]\n" + "fmla v25.4s, v16.4s, v21.s[0]\n" "add x12, x12, #0x10\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" + "fmla v26.4s, v16.4s, v20.s[0]\n" + "fmla v27.4s, v16.4s, v19.s[0]\n" + "fmla v28.4s, v16.4s, v18.s[0]\n" + "fmla v29.4s, v16.4s, v17.s[0]\n" "cbnz x9, 120b\n" "121:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1441,17 +1440,17 @@ void a64_hybrid_fp32_mla_8x4 ( "135:" // Height 7: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 136f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" "cbnz x10, 137f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1464,12 +1463,12 @@ void a64_hybrid_fp32_mla_8x4 ( "b 137f\n" "136:" // Height 7: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "137:" // Height 7: input setup done "cmp x9, #0x4\n" "blt 140f\n" @@ -1592,23 +1591,23 @@ void a64_hybrid_fp32_mla_8x4 ( "140:" // Height 7: Multiply loop: Main loop skip "cbz x9, 142f\n" "141:" // Height 7: Multiply loop: Odd block loop - "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s23, [x28], #0x4\n" + "ldr s22, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" + "ldr s21, [x26], #0x4\n" + "ldr s20, [x25], #0x4\n" + "ldr s19, [x24], #0x4\n" + "ldr s18, [x23], #0x4\n" + "ldr s17, [x22], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v23.s[0]\n" + "fmla v25.4s, v16.4s, 
v22.s[0]\n" + "fmla v26.4s, v16.4s, v21.s[0]\n" + "fmla v27.4s, v16.4s, v20.s[0]\n" "add x12, x12, #0x10\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" - "fmla v30.4s, v12.4s, v6.s[0]\n" + "fmla v28.4s, v16.4s, v19.s[0]\n" + "fmla v29.4s, v16.4s, v18.s[0]\n" + "fmla v30.4s, v16.4s, v17.s[0]\n" "cbnz x9, 141b\n" "142:" // Height 7: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -1780,18 +1779,18 @@ void a64_hybrid_fp32_mla_8x4 ( "156:" // Height 8: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 157f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" - "ldr x21, [x21, #0x38]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x21, [x20, #0x38]\n" "cbnz x10, 158f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1805,13 +1804,13 @@ void a64_hybrid_fp32_mla_8x4 ( "b 158f\n" "157:" // Height 8: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "158:" // Height 8: input setup done "cmp x9, #0x4\n" "blt 161f\n" @@ -1949,24 +1948,24 @@ void a64_hybrid_fp32_mla_8x4 ( "cbz x9, 163f\n" "162:" // Height 8: Multiply loop: Odd block loop "ldr s0, [x28], #0x4\n" - "ldr s1, [x27], #0x4\n" + "ldr s23, [x27], #0x4\n" "sub x9, x9, #0x1\n" - "ldr s2, [x26], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x24], #0x4\n" - "ldr s5, [x23], #0x4\n" - "ldr s6, [x22], #0x4\n" - "ldr s7, [x21], #0x4\n" - "ldr q12, [x12, #0x0]\n" - "fmla v24.4s, v12.4s, v0.s[0]\n" - "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr s22, [x26], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s17, [x21], #0x4\n" + "ldr q16, [x12, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "fmla v25.4s, v16.4s, v23.s[0]\n" "add x12, x12, #0x10\n" - "fmla v26.4s, v12.4s, v2.s[0]\n" - "fmla v27.4s, v12.4s, v3.s[0]\n" - "fmla v28.4s, v12.4s, v4.s[0]\n" - "fmla v29.4s, v12.4s, v5.s[0]\n" - "fmla v30.4s, v12.4s, v6.s[0]\n" - "fmla v31.4s, v12.4s, v7.s[0]\n" + "fmla v26.4s, v16.4s, v22.s[0]\n" + "fmla v27.4s, v16.4s, v21.s[0]\n" + "fmla v28.4s, v16.4s, v20.s[0]\n" + "fmla v29.4s, v16.4s, v19.s[0]\n" + "fmla v30.4s, v16.4s, v18.s[0]\n" + "fmla v31.4s, v16.4s, v17.s[0]\n" "cbnz x9, 162b\n" "163:" // Height 8: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -2068,10 +2067,9 @@ void a64_hybrid_fp32_mla_8x4 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "170:" // Exit - : 
[M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp index e6e7950979..f31dd7afd0 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -99,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp index a0ea96822a..0e468b196a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp @@ -93,7 +93,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 130f\n" @@ -255,11 +254,11 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "20:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 21f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 22f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -279,31 +278,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "23:" // Height 1: Multiply loop: Main loop head ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" + "ldr q24, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q23, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" + "ldr q22, [x28, #0x60]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q21, [x28, #0x70]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + "ldr q24, [x28, #0x80]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x90]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x28, #0xa0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0xb0]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x8\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n" "add x28, x28, #0xc0\n" "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" "ldr q6, [x28, #0x20]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "ldr q7, [x28, #0x30]\n" "prfm pldl1keep, [x24, #0x80]\n" "ld1 { v0.4s }, [x24], #0x10\n" @@ -311,28 +310,28 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "24:" // Height 1: Multiply loop: Single iteration only ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, 
#0x40]\n" + "ldr q23, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q25, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" + "ldr q21, [x28, #0x60]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x6e57ec0a // bfmmla v10.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x80]\n" + ".inst 0x6e59ec10 // bfmmla v16.4s, v0.8h, v25.8h\n" + "ldr q22, [x28, #0x90]\n" + ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0xa0]\n" + ".inst 0x6e58ec11 // bfmmla v17.4s, v0.8h, v24.8h\n" + "ldr q5, [x28, #0xb0]\n" "sub x25, x25, #0x4\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec0c // bfmmla v12.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec12 // bfmmla v18.4s, v0.8h, v22.8h\n" "prfm pldl1keep, [x24, #0x80]\n" "add x28, x28, #0xc0\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec0d // bfmmla v13.4s, v0.8h, v21.8h\n" + ".inst 0x6e45ec13 // bfmmla v19.4s, v0.8h, v5.8h\n" "25:" // Height 1: Multiply loop: Main loop skip "cbz x25, 28f\n" "cbz x25, 28f\n" @@ -344,31 +343,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr s0, [x24, #0x0]\n" "27:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q21, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q4, [x28, #0x40]\n" - "ldr q5, [x28, #0x50]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q6, [x28, #0x60]\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q4, [x28, #0x80]\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - "ldr q6, [x28, #0xa0]\n" - "ldr q7, [x28, #0xb0]\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec08 // bfmmla v8.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0x20]\n" + "ldr q22, [x28, #0x30]\n" + ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n" + ".inst 0x6e55ec09 // bfmmla v9.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0x40]\n" + "ldr q23, [x28, #0x50]\n" + ".inst 0x6e56ec0f // bfmmla v15.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec0a // bfmmla v10.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0x60]\n" + "ldr q22, [x28, #0x70]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + ".inst 0x6e55ec0b // bfmmla v11.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0x80]\n" + "ldr q23, [x28, #0x90]\n" + ".inst 0x6e56ec11 // bfmmla v17.4s, v0.8h, v22.8h\n" + 
".inst 0x6e55ec0c // bfmmla v12.4s, v0.8h, v21.8h\n" + "ldr q22, [x28, #0xa0]\n" + "ldr q21, [x28, #0xb0]\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x28, x28, #0xc0\n" "28:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -384,21 +383,21 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "uzp1 v13.2d, v13.2d, v19.2d\n" "tbz %x[flags], #1, 29f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v22.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v21.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v22.4s\n" + "fmin v9.4s, v9.4s, v22.4s\n" + "fmin v10.4s, v10.4s, v22.4s\n" + "fmin v11.4s, v11.4s, v22.4s\n" + "fmin v12.4s, v12.4s, v22.4s\n" + "fmin v13.4s, v13.4s, v22.4s\n" + "fmax v8.4s, v8.4s, v21.4s\n" + "fmax v9.4s, v9.4s, v21.4s\n" + "fmax v10.4s, v10.4s, v21.4s\n" + "fmax v11.4s, v11.4s, v21.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "fmax v13.4s, v13.4s, v21.4s\n" "29:" // Height 1: No activation "cmp x9, #0x18\n" "bge 42f\n" @@ -678,12 +677,12 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "63:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 64f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 65f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -691,7 +690,7 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "b 65f\n" "64:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "65:" // Height 2: input setup done "cmp x25, #0x4\n" "blt 68f\n" @@ -707,31 +706,31 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" + "ldr q3, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q23, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" + "ldr q22, [x28, #0x60]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q21, [x28, #0x70]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + "ldr q1, [x28, #0x80]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, 
v0.8h, v23.8h\n" + "ldr q23, [x28, #0x90]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x28, #0xa0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0xb0]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x8\n" "add x28, x28, #0xc0\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e41ec0c // bfmmla v12.4s, v0.8h, v1.8h\n" "ldr q4, [x28, #0x0]\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "ldr q5, [x28, #0x10]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" "ldr q6, [x28, #0x20]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "ldr q7, [x28, #0x30]\n" "prfm pldl1keep, [x24, #0x80]\n" "ld1 { v0.4s }, [x24], #0x10\n" @@ -742,28 +741,28 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" + "ldr q24, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q23, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" + "ldr q22, [x28, #0x60]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q21, [x28, #0x70]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + "ldr q24, [x28, #0x80]\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q23, [x28, #0x90]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + "ldr q22, [x28, #0xa0]\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q21, [x28, #0xb0]\n" "sub x25, x25, #0x4\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" + ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x28, x28, #0xc0\n" "68:" // Height 2: Multiply loop: Main loop skip "cbz x25, 71f\n" @@ -779,32 +778,32 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "ldr s0, [x24, #0x0]\n" "ldr s1, [x23, #0x0]\n" "70:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q24, [x28, #0x0]\n" + "ldr q23, [x28, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - "ldr q4, [x28, #0x40]\n" - "ldr q5, [x28, #0x50]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "ldr q6, [x28, #0x60]\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec10 // bfmmla 
v16.4s, v0.8h, v5.8h\n" - "ldr q4, [x28, #0x80]\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - "ldr q6, [x28, #0xa0]\n" - "ldr q7, [x28, #0xb0]\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + "ldr q22, [x28, #0x20]\n" + "ldr q21, [x28, #0x30]\n" + ".inst 0x6e58ec08 // bfmmla v8.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec0e // bfmmla v14.4s, v0.8h, v23.8h\n" + "ldr q24, [x28, #0x40]\n" + "ldr q23, [x28, #0x50]\n" + ".inst 0x6e56ec09 // bfmmla v9.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec0f // bfmmla v15.4s, v0.8h, v21.8h\n" + "ldr q22, [x28, #0x60]\n" + "ldr q21, [x28, #0x70]\n" + ".inst 0x6e58ec0a // bfmmla v10.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec10 // bfmmla v16.4s, v0.8h, v23.8h\n" + "ldr q24, [x28, #0x80]\n" + "ldr q23, [x28, #0x90]\n" + ".inst 0x6e56ec0b // bfmmla v11.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec11 // bfmmla v17.4s, v0.8h, v21.8h\n" + "ldr q22, [x28, #0xa0]\n" + "ldr q21, [x28, #0xb0]\n" + ".inst 0x6e58ec0c // bfmmla v12.4s, v0.8h, v24.8h\n" + ".inst 0x6e57ec12 // bfmmla v18.4s, v0.8h, v23.8h\n" + ".inst 0x6e56ec0d // bfmmla v13.4s, v0.8h, v22.8h\n" + ".inst 0x6e55ec13 // bfmmla v19.4s, v0.8h, v21.8h\n" "add x28, x28, #0xc0\n" "71:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -829,33 +828,33 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "uzp2 v13.2d, v13.2d, v19.2d\n" "tbz %x[flags], #1, 72f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v22.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v4.4s, v4.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" + "ld1r { v21.4s }, [x20]\n" + "fmin v4.4s, v4.4s, v22.4s\n" + "fmin v14.4s, v14.4s, v22.4s\n" + "fmin v15.4s, v15.4s, v22.4s\n" + "fmin v16.4s, v16.4s, v22.4s\n" + "fmin v17.4s, v17.4s, v22.4s\n" + "fmin v18.4s, v18.4s, v22.4s\n" + "fmin v8.4s, v8.4s, v22.4s\n" + "fmin v9.4s, v9.4s, v22.4s\n" + "fmin v10.4s, v10.4s, v22.4s\n" + "fmin v11.4s, v11.4s, v22.4s\n" + "fmin v12.4s, v12.4s, v22.4s\n" + "fmin v13.4s, v13.4s, v22.4s\n" + "fmax v4.4s, v4.4s, v21.4s\n" + "fmax v14.4s, v14.4s, v21.4s\n" + "fmax v15.4s, v15.4s, v21.4s\n" + "fmax v16.4s, v16.4s, v21.4s\n" + "fmax v17.4s, v17.4s, v21.4s\n" + "fmax v18.4s, v18.4s, v21.4s\n" + "fmax v8.4s, v8.4s, v21.4s\n" + "fmax v9.4s, v9.4s, v21.4s\n" + "fmax v10.4s, v10.4s, v21.4s\n" + "fmax v11.4s, v11.4s, v21.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "fmax v13.4s, v13.4s, v21.4s\n" "72:" // Height 2: No activation "cmp x9, #0x18\n" "bge 85f\n" @@ -1238,13 +1237,13 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( 
"106:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 107f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 108f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1253,8 +1252,8 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "b 108f\n" "107:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "108:" // Height 3: input setup done "cmp x25, #0x4\n" "blt 111f\n" @@ -1285,7 +1284,7 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" + "ldr q3, [x28, #0x70]\n" ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" "prfm pldl1keep, [x23, #0x80]\n" "ld1 { v1.4s }, [x23], #0x10\n" @@ -1298,9 +1297,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + ".inst 0x6e43ec11 // bfmmla v17.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec5d // bfmmla v29.4s, v2.8h, v3.8h\n" + "ldr q3, [x28, #0xb0]\n" "add x28, x28, #0xc0\n" ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" @@ -1311,9 +1310,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" "ldr q6, [x28, #0x20]\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec13 // bfmmla v19.4s, v0.8h, v3.8h\n" "ld1 { v0.4s }, [x24], #0x10\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec5f // bfmmla v31.4s, v2.8h, v3.8h\n" "ld1 { v2.4s }, [x22], #0x10\n" "ldr q7, [x28, #0x30]\n" "bge 109b\n" @@ -1324,10 +1323,10 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "sub x25, x25, #0x4\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" + "ldr q3, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q4, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" @@ -1335,29 +1334,29 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 
0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" + ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x90]\n" ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q3, [x28, #0xa0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x28, #0xb0]\n" "add x28, x28, #0xc0\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "111:" // Height 3: Multiply loop: Main loop skip "cbz x25, 114f\n" "cbz x25, 114f\n" @@ -1375,46 +1374,46 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "ldr s1, [x23, #0x0]\n" "ldr s2, [x22, #0x0]\n" "113:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q5, [x28, #0x0]\n" + "ldr q4, [x28, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" + "ldr q3, [x28, #0x20]\n" + "ldr q1, [x28, #0x30]\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5a // 
bfmmla v26.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n" + "ldr q3, [x28, #0x60]\n" + ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5b // bfmmla v27.4s, v2.8h, v1.8h\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x90]\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + "ldr q3, [x28, #0xa0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x28, #0xb0]\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" "add x28, x28, #0xc0\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "114:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -1937,14 +1936,14 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "149:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 150f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 151f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1954,9 +1953,9 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "b 151f\n" "150:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "151:" // Height 4: input setup done "cmp x25, #0x4\n" "blt 154f\n" @@ -2033,39 +2032,39 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" + "ldr q3, [x28, #0x40]\n" ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" + "ldr q4, [x28, #0x50]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" "prfm pldl1keep, 
[x21, #0x80]\n" ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" "ldr q6, [x28, #0x60]\n" ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x90]\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec56 // bfmmla v22.4s, v2.8h, v3.8h\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x90]\n" ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + "ldr q3, [x28, #0xa0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x28, #0xb0]\n" "add x28, x28, #0xc0\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "154:" // Height 4: Multiply loop: Main loop skip "cbz x25, 157f\n" "cbz x25, 157f\n" @@ -2086,47 +2085,47 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "ldr s2, [x22, #0x0]\n" "ldr s3, [x21, #0x0]\n" "156:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q4, [x28, #0x0]\n" - "ldr q5, [x28, #0x10]\n" + "ldr q5, [x28, #0x0]\n" + "ldr q4, [x28, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" - "ldr q6, [x28, #0x20]\n" - "ldr q7, [x28, #0x30]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" - ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x40]\n" - ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n" - "ldr q5, [x28, #0x50]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0x60]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0x70]\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" - "ldr q4, [x28, #0x80]\n" - ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n" - "ldr q5, 
[x28, #0x90]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x28, #0xa0]\n" - ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n" - "ldr q7, [x28, #0xb0]\n" + ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x40]\n" + ".inst 0x6e44ec0e // bfmmla v14.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5a // bfmmla v26.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q3, [x28, #0x60]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec5b // bfmmla v27.4s, v2.8h, v6.8h\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + "ldr q5, [x28, #0x80]\n" + ".inst 0x6e44ec10 // bfmmla v16.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5c // bfmmla v28.4s, v2.8h, v4.8h\n" + "ldr q4, [x28, #0x90]\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + "ldr q3, [x28, #0xa0]\n" + ".inst 0x6e41ec11 // bfmmla v17.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5d // bfmmla v29.4s, v2.8h, v1.8h\n" + "ldr q1, [x28, #0xb0]\n" "add x28, x28, #0xc0\n" - ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" - ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n" - ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec12 // bfmmla v18.4s, v0.8h, v4.8h\n" + ".inst 0x6e44ec5e // bfmmla v30.4s, v2.8h, v4.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" + ".inst 0x6e41ec13 // bfmmla v19.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec5f // bfmmla v31.4s, v2.8h, v1.8h\n" "157:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -2415,7 +2414,6 @@ void a64_hybrid_fp32bf16fp32_mmla_4x24 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "174:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp index 39ffcbef12..71e16d68b5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -99,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp index 4993777d62..5693c3f397 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp @@ -93,7 +93,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( break; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 176f\n" @@ -211,11 +210,11 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "16:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -233,23 +232,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "19:" // Height 1: Multiply loop: Main loop head ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q18, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" 
- "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x8\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" "add x10, x10, #0x80\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n" "ldr q7, [x10, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "ld1 { v0.4s }, [x26], #0x10\n" @@ -257,20 +256,20 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "20:" // Height 1: Multiply loop: Single iteration only ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q18, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x80\n" "21:" // Height 1: Multiply loop: Main loop skip @@ -284,23 +283,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr s0, [x26, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // 
bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n" "add x10, x10, #0x80\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -314,17 +313,17 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "uzp1 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "25:" // Height 1: No activation "cmp x11, #0x10\n" "bge 34f\n" @@ -515,12 +514,12 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "51:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 52f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 53f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -528,7 +527,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "b 53f\n" "52:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "53:" // Height 2: input setup done "cmp x27, #0x4\n" "blt 56f\n" @@ -542,23 +541,23 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q18, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x8\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, 
v0.8h, v17.8h\n" "ldr q7, [x10, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "ld1 { v0.4s }, [x26], #0x10\n" @@ -569,20 +568,20 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q18, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x4\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "add x10, x10, #0x80\n" @@ -600,24 +599,24 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ldr s0, [x26, #0x0]\n" "ldr s1, [x25, #0x0]\n" "58:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e52ec08 // bfmmla v8.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e51ec0c // bfmmla v12.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e52ec09 // bfmmla v9.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e51ec0d // bfmmla v13.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e52ec0a // bfmmla v10.4s, v0.8h, v18.8h\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e51ec0e // bfmmla v14.4s, v0.8h, v17.8h\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e52ec0b // bfmmla v11.4s, v0.8h, v18.8h\n" + ".inst 0x6e51ec0f // bfmmla v15.4s, v0.8h, v17.8h\n" "add x10, x10, #0x80\n" "59:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -638,25 +637,25 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "uzp2 v11.2d, v11.2d, v15.2d\n" "tbz %x[flags], #1, 60f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v18.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v6.4s, v6.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, 
v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v6.4s, v6.4s, v18.4s\n" + "fmin v12.4s, v12.4s, v18.4s\n" + "fmin v13.4s, v13.4s, v18.4s\n" + "fmin v14.4s, v14.4s, v18.4s\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v6.4s, v6.4s, v17.4s\n" + "fmax v12.4s, v12.4s, v17.4s\n" + "fmax v13.4s, v13.4s, v17.4s\n" + "fmax v14.4s, v14.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" "60:" // Height 2: No activation "cmp x11, #0x10\n" "bge 69f\n" @@ -912,13 +911,13 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "86:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 87f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 88f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -927,8 +926,8 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "b 88f\n" "87:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "88:" // Height 3: input setup done "cmp x27, #0x4\n" "blt 91f\n" @@ -946,34 +945,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "sub x27, x27, #0x4\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q26, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" "cmp x27, #0x8\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" "prfm pldl1keep, [x25, #0x80]\n" "ld1 { v1.4s }, [x25], #0x10\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr 
q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" "ld1 { v0.4s }, [x26], #0x10\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "ld1 { v2.4s }, [x24], #0x10\n" "ldr q7, [x10, #0x10]\n" "bge 89b\n" @@ -984,30 +983,30 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "sub x27, x27, #0x4\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q26, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "91:" // Height 3: Multiply loop: Main loop skip "cbz x27, 94f\n" "cbz x27, 94f\n" @@ -1025,34 +1024,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ldr s1, [x25, #0x0]\n" "ldr s2, [x24, #0x0]\n" "93:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, 
v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "94:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1078,33 +1077,33 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "uzp1 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 95f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v6.4s, v6.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v6.4s, v6.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v6.4s, v6.4s, v25.4s\n" + "fmax v12.4s, v12.4s, 
v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "95:" // Height 3: No activation "cmp x11, #0x10\n" "bge 104f\n" @@ -1401,14 +1400,14 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "121:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 122f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 123f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1418,9 +1417,9 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "b 123f\n" "122:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "123:" // Height 4: input setup done "cmp x27, #0x4\n" "blt 126f\n" @@ -1442,34 +1441,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q26, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" "prfm pldl1keep, [x25, #0x80]\n" "ld1 { v1.4s }, [x25], #0x10\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" "prfm pldl1keep, [x23, #0x80]\n" "ld1 { v3.4s }, [x23], #0x10\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" "ldr q6, [x10, 
#0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" "ld1 { v0.4s }, [x26], #0x10\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "ld1 { v2.4s }, [x24], #0x10\n" "ldr q7, [x10, #0x10]\n" "bge 124b\n" @@ -1483,29 +1482,29 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" "prfm pldl1keep, [x25, #0x80]\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q26, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "126:" // Height 4: Multiply loop: Main loop skip "cbz x27, 129f\n" "cbz x27, 129f\n" @@ -1526,35 +1525,35 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ldr s2, [x24, #0x0]\n" "ldr s3, [x23, #0x0]\n" "128:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - 
"ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e5aec08 // bfmmla v8.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec50 // bfmmla v16.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e59ec0c // bfmmla v12.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec54 // bfmmla v20.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e5aec09 // bfmmla v9.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec51 // bfmmla v17.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e59ec0d // bfmmla v13.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec55 // bfmmla v21.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e5aec0a // bfmmla v10.4s, v0.8h, v26.8h\n" + ".inst 0x6e5aec52 // bfmmla v18.4s, v2.8h, v26.8h\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e59ec0e // bfmmla v14.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec56 // bfmmla v22.4s, v2.8h, v25.8h\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e5aec0b // bfmmla v11.4s, v0.8h, v26.8h\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e5aec53 // bfmmla v19.4s, v2.8h, v26.8h\n" + ".inst 0x6e59ec0f // bfmmla v15.4s, v0.8h, v25.8h\n" + ".inst 0x6e59ec57 // bfmmla v23.4s, v2.8h, v25.8h\n" "129:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1586,41 +1585,41 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "uzp2 v19.2d, v19.2d, v23.2d\n" "tbz %x[flags], #1, 130f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1r { v1.4s }, [x20]\n" + "ld1r { v26.4s }, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v6.4s, v6.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v6.4s, v6.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v21.4s, v21.4s, v26.4s\n" + "fmin v22.4s, v22.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v6.4s, v6.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax 
v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v15.4s, v15.4s, v25.4s\n" + "fmax v20.4s, v20.4s, v25.4s\n" + "fmax v21.4s, v21.4s, v25.4s\n" + "fmax v22.4s, v22.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" "130:" // Height 4: No activation "cmp x11, #0x10\n" "bge 139f\n" @@ -1982,15 +1981,15 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "156:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 157f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 158f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -2001,10 +2000,10 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "b 158f\n" "157:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "158:" // Height 5: input setup done "cmp x27, #0x4\n" "blt 161f\n" @@ -2029,43 +2028,43 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q3, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" "prfm pldl1keep, [x25, #0x80]\n" "ld1 { v1.4s }, [x25], #0x10\n" ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q5, [x10, #0x30]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n" "ldr q6, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" "ld1 { v3.4s }, [x23], #0x10\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n" + "ldr q5, [x10, #0x50]\n" ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - 
".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + ".inst 0x6e45ec9e // bfmmla v30.4s, v4.8h, v5.8h\n" + "ldr q5, [x10, #0x70]\n" "add x10, x10, #0x80\n" ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e45ec0f // bfmmla v15.4s, v0.8h, v5.8h\n" "ld1 { v0.4s }, [x26], #0x10\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" "ld1 { v2.4s }, [x24], #0x10\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e45ec9f // bfmmla v31.4s, v4.8h, v5.8h\n" "ld1 { v4.4s }, [x22], #0x10\n" "ldr q7, [x10, #0x10]\n" "bge 159b\n" @@ -2081,37 +2080,37 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q3, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x40]\n" + ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x60]\n" + ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e41ec0f // 
bfmmla v15.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n" "161:" // Height 5: Multiply loop: Main loop skip "cbz x27, 164f\n" "cbz x27, 164f\n" @@ -2136,7 +2135,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ldr s4, [x22, #0x0]\n" "163:" // Height 5: Multiply loop: Ragged operand read: Done "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q5, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" @@ -2145,34 +2144,34 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + "ldr q3, [x10, #0x20]\n" + ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" + ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" + ".inst 0x6e45ec9c // bfmmla v28.4s, v4.8h, v5.8h\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x40]\n" + ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x60]\n" + ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x70]\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n" "164:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2658,16 
+2657,16 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "191:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 192f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 193f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -2679,11 +2678,11 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "b 193f\n" "192:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "193:" // Height 6: input setup done "cmp x27, #0x4\n" "blt 196f\n" @@ -2716,7 +2715,7 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" + "ldr q5, [x10, #0x30]\n" "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" @@ -2724,10 +2723,10 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ld1 { v3.4s }, [x23], #0x10\n" ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" + ".inst 0x6e45ec9d // bfmmla v29.4s, v4.8h, v5.8h\n" "ldr q7, [x10, #0x50]\n" "prfm pldl1keep, [x21, #0x80]\n" "ld1 { v5.4s }, [x21], #0x10\n" @@ -2766,37 +2765,37 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" "prfm pldl1keep, [x24, #0x80]\n" ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" + "ldr q3, [x10, #0x20]\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" + "ldr q1, [x10, #0x30]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n" "prfm pldl1keep, [x21, #0x80]\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla 
v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x40]\n" + ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x60]\n" + ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n" "196:" // Height 6: Multiply loop: Main loop skip "cbz x27, 199f\n" "cbz x27, 199f\n" @@ -2823,45 +2822,45 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "ldr s4, [x22, #0x0]\n" "ldr s5, [x21, #0x0]\n" "198:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n" ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n" ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n" ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n" ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + ".inst 
0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q3, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec51 // bfmmla v17.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec99 // bfmmla v25.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x40]\n" + ".inst 0x6e41ec0d // bfmmla v13.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9d // bfmmla v29.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e43ec0a // bfmmla v10.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec52 // bfmmla v18.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9a // bfmmla v26.4s, v4.8h, v3.8h\n" + "ldr q3, [x10, #0x60]\n" + ".inst 0x6e41ec0e // bfmmla v14.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec56 // bfmmla v22.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9e // bfmmla v30.4s, v4.8h, v1.8h\n" + "ldr q1, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e43ec53 // bfmmla v19.4s, v2.8h, v3.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e41ec0f // bfmmla v15.4s, v0.8h, v1.8h\n" + ".inst 0x6e41ec57 // bfmmla v23.4s, v2.8h, v1.8h\n" + ".inst 0x6e41ec9f // bfmmla v31.4s, v4.8h, v1.8h\n" "199:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3126,7 +3125,6 @@ void a64_hybrid_fp32bf16fp32_mmla_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "212:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp index 905a60265c..bfc9c7e8f9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -108,5 +108,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp index b31b80586c..eac0e7167e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp @@ -78,329 +78,328 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 91f\n" "cmp %x[M], #0x2\n" "bgt 61f\n" "beq 31f\n" - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v15.16b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "mov x14, %x[output_ptr]\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "mov x13, %x[output_ptr]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" "2:" // Height 1: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" "movi v18.4s, #0x0\n" "movi v19.4s, #0x0\n" "3:" // Height 1: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "cbnz x12, 6f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "cbnz x11, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" + "add x9, x9, x20\n" "b 6f\n" "5:" // Height 1: setup direct input - "mov x10, %x[input_ptr]\n" + "mov x9, %x[input_ptr]\n" "6:" // Height 1: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 11f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, 
#0x0]\n" + "cmp x10, #0x20\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr d4, [x13, #0x70]\n" - "ldr x9, [x13, #0x78]\n" + "ldr d21, [x12, #0x70]\n" + "ldr x20, [x12, #0x78]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d20, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d26, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" - "mov v4.d[1], x9\n" - "ldr x28, [x13, #0x88]\n" + "ldr d25, [x12, #0xa0]\n" + "mov v21.d[1], x20\n" + "ldr x20, [x12, #0x88]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" + "ldr d24, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d23, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - "mov v5.d[1], x28\n" - "ldr x27, [x13, #0x98]\n" - "mov v6.d[1], x27\n" - "ldr x26, [x13, #0xa8]\n" - "mov v7.d[1], x26\n" - "ldr x25, [x13, #0xb8]\n" - "mov v8.d[1], x25\n" - "ldr x24, [x13, #0xc8]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "ldr x20, [x13, #0xd8]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - "ldr x9, [x13, #0xe8]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - "ldr x28, [x13, #0xf8]\n" - "mov v9.d[1], x24\n" - "mov v10.d[1], x20\n" - "add x10, x10, #0x10\n" - "mov v4.d[1], x9\n" - "add x13, x13, #0x100\n" - "mov v5.d[1], x28\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "ldr d22, [x12, #0xd0]\n" + ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" + "ldr d21, [x12, #0xe0]\n" + "mov v20.d[1], x20\n" + "ldr x20, [x12, #0x98]\n" + "mov v26.d[1], x20\n" + "ldr x20, [x12, #0xa8]\n" + "mov v25.d[1], x20\n" + "ldr x20, [x12, #0xb8]\n" + "mov v24.d[1], x20\n" + "ldr x23, [x12, #0xc8]\n" + ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" + "ldr d20, [x12, #0xf0]\n" + ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" + "ldr x22, [x12, #0xd8]\n" + ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" + "ldr x21, [x12, #0xe8]\n" + ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" + "ldr x20, [x12, #0xf8]\n" + "mov v23.d[1], x23\n" + "mov v22.d[1], x22\n" + "add x9, x9, #0x10\n" + "mov v21.d[1], x21\n" + "add x12, x12, #0x100\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q4, [x13, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, 
[x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q4, [x12, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q21, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q20, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q26, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q25, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q24, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q23, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "sub x11, x11, #0x10\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - "add x10, x10, #0x10\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "ldr q22, [x12, #0xd0]\n" + ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x12, #0xe0]\n" + ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x12, #0xf0]\n" + ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" + "sub x10, x10, #0x10\n" + ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" + "add x9, x9, #0x10\n" + ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "11:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 18f\n" - "cmp x11, #0x4\n" + "cbz x10, 18f\n" + "cmp x10, #0x4\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" + "ldr s0, [x9], #0x4\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q22, [x12, 
#0x10]\n" + "cmp x10, #0x4\n" + "ldr q21, [x12, #0x20]\n" + ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x30]\n" + ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n" + ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks - "cbz x11, 18f\n" - "tbz x11, #1, 15f\n" - "ldr h0, [x10], #0x2\n" - "tbz x11, #0, 16f\n" - "ld1 { v0.b }[2], [x10]\n" + "cbz x10, 18f\n" + "tbz x10, #1, 15f\n" + "ldr h0, [x9], #0x2\n" + "tbz x10, #0, 16f\n" + "ld1 { v0.b }[2], [x9]\n" "b 16f\n" "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" + "ldr b0, [x9, #0x0]\n" "16:" // Height 1: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 17f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" + "ldr q20, [x12, #0x0]\n" + ".inst 0x4f80e290 // sdot v16.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x10]\n" + ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x20]\n" + ".inst 0x4f80e292 // sdot v18.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x30]\n" + ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 4b\n" - "prfm pstl1keep, [x14, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" - "neg v1.4s, v1.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v20.4s }, [x20]\n" + "neg v20.4s, v20.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v20.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q23, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q22, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q21, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q20, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v23.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v20.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" 
+ "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 20f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v0.16b\n" + "and v21.16b, v18.16b, v0.16b\n" + "and v20.16b, v19.16b, v0.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v20.4s }, [x20]\n" + "add v16.4s, v16.4s, v20.4s\n" + "add v17.4s, v17.4s, v20.4s\n" + "add v18.4s, v18.4s, v20.4s\n" + "add v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v20.4s }, [x20]\n" + "smin v16.4s, v16.4s, v20.4s\n" + "smin v17.4s, v17.4s, v20.4s\n" + "smin v18.4s, v18.4s, v20.4s\n" + "smin v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "cmp x15, #0x10\n" + "cmp x14, #0x10\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 29f\n" - "tbz x15, #3, 24f\n" - "str d16, [x14], #0x8\n" - "tbz x15, #2, 22f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "tbz x15, #1, 21f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[14], [x14]\n" + "tbz x14, #3, 24f\n" + "str d16, [x13], #0x8\n" + "tbz x14, #2, 22f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "tbz x14, #1, 21f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[14], [x13]\n" "b 28f\n" "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[12], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[12], [x13]\n" "b 28f\n" "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 23f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[10], [x14]\n" + "tbz x14, #1, 23f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[10], [x13]\n" "b 28f\n" "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[8], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[8], [x13]\n" "b 28f\n" "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, 
#2, 26f\n" - "str s16, [x14], #0x4\n" - "tbz x15, #1, 25f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[6], [x14]\n" + "tbz x14, #2, 26f\n" + "str s16, [x13], #0x4\n" + "tbz x14, #1, 25f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[6], [x13]\n" "b 28f\n" "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[4], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[4], [x13]\n" "b 28f\n" "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 27f\n" - "str h16, [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[2], [x14]\n" + "tbz x14, #1, 27f\n" + "str h16, [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[2], [x13]\n" "b 28f\n" "27:" // Height 1: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" + "str b16, [x13, #0x0]\n" "28:" // Height 1: Partial direct writeback: Done "b 30f\n" "29:" // Height 1: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" "30:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 2b\n" "b 122f\n" "31:" // Height 2 - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v15.16b, #0x1\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" "32:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" @@ -411,307 +410,307 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "33:" // Height 2: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 35f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "cbnz x12, 36f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x11, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" "b 36f\n" "35:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" "36:" // Height 2: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 41f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 39f\n" "37:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x20, [x12, 
#0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d25, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v25.d[1], x20\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d24, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x23, [x12, #0x88]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d30, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x22, [x12, #0x98]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr d29, [x12, #0xa0]\n" + "ldr x21, [x12, #0xa8]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr d28, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d27, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "mov v5.d[1], x28\n" + "mov v24.d[1], x23\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "mov v6.d[1], x27\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - "mov v7.d[1], x26\n" - "ldr x24, [x13, #0xc8]\n" - "mov v8.d[1], x25\n" - "ldr x20, [x13, #0xd8]\n" - "ldr x9, [x13, #0xe8]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - "ldr x28, [x13, #0xf8]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - "mov v9.d[1], x24\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - "mov v10.d[1], x20\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - "mov v4.d[1], x9\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - "add x10, x10, #0x10\n" - "add x23, x23, #0x10\n" - "add x13, x13, #0x100\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "ldr d26, [x12, #0xd0]\n" + ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" + "mov v30.d[1], x22\n" + ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" + "ldr d25, [x12, #0xe0]\n" + "mov v29.d[1], x21\n" + "ldr x23, [x12, #0xc8]\n" + "mov v28.d[1], x20\n" + "ldr x22, [x12, #0xd8]\n" + "ldr x21, [x12, #0xe8]\n" + ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" + "ldr d24, [x12, #0xf0]\n" + "ldr x20, [x12, #0xf8]\n" + ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" + "mov v27.d[1], x23\n" + ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" + "mov v26.d[1], x22\n" + ".inst 0x4f81ebb6 // sdot v22.4s, 
v29.16b, v1.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" + "add x9, x9, #0x10\n" + "add x28, x28, #0x10\n" + "add x12, x12, #0x100\n" + ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 38f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "38:" // Height 2: Multiply loop: unique 5: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "bge 37b\n" "39:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q25, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q24, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q30, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q29, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q28, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q27, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 
0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "ldr q26, [x12, #0xd0]\n" + ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x12, #0xe0]\n" + ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x12, #0xf0]\n" + ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "40:" // Height 2: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "41:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 48f\n" - "cmp x11, #0x4\n" + "cbz x10, 48f\n" + "cmp x10, #0x4\n" "blt 44f\n" "42:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" "tbnz %x[flags], #31, 43f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q27, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q26, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q25, [x12, #0x20]\n" + ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" + "ldr q24, [x12, #0x30]\n" + ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" + ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 
0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" "bge 42b\n" "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x11, 48f\n" - "tbz x11, #1, 45f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "tbz x11, #0, 46f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" + "cbz x10, 48f\n" + "tbz x10, #1, 45f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x10, #0, 46f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" "b 46f\n" "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" "46:" // Height 2: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 47f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + "ldr q24, [x12, #0x0]\n" + ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n" + "ldr q26, [x12, #0x10]\n" + ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n" + "ldr q25, [x12, #0x20]\n" + ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" + "ldr q24, [x12, #0x30]\n" + ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" "48:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 34b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "add x23, x13, x20\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "tbnz %x[flags], #31, 49f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" - "neg v2.4s, v2.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" + "neg v24.4s, v24.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + "mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "49:" // Height 2: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q27, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q26, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q25, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q24, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add v22.4s, v22.4s, 
v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v27.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v27.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 50f\n" - "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" + "and v24.16b, v16.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v24.4s\n" + "and v30.16b, v17.16b, v0.16b\n" + "and v29.16b, v18.16b, v0.16b\n" + "and v28.16b, v19.16b, v0.16b\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v0.16b\n" + "and v25.16b, v22.16b, v0.16b\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "50:" // Height 2: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -721,122 +720,122 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add 
v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v24.4s }, [x20]\n" + "add v16.4s, v16.4s, v24.4s\n" + "add v17.4s, v17.4s, v24.4s\n" + "add v18.4s, v18.4s, v24.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v24.4s\n" + "add v21.4s, v21.4s, v24.4s\n" + "add v22.4s, v22.4s, v24.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v24.4s }, [x20]\n" + "smin v16.4s, v16.4s, v24.4s\n" + "smin v17.4s, v17.4s, v24.4s\n" + "smin v18.4s, v18.4s, v24.4s\n" + "smin v19.4s, v19.4s, v24.4s\n" + "smin v20.4s, v20.4s, v24.4s\n" + "smin v21.4s, v21.4s, v24.4s\n" + "smin v22.4s, v22.4s, v24.4s\n" + "smin v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 59f\n" - "tbz x15, #3, 54f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "tbz x15, #2, 52f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "tbz x15, #1, 51f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" + "tbz x14, #3, 54f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "tbz x14, #2, 52f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "tbz x14, #1, 51f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" "b 58f\n" "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 58f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" "b 58f\n" "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 53f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" + "tbz x14, #1, 53f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + 
"tbz x14, #0, 58f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" "b 58f\n" "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 58f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" "b 58f\n" "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 56f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "tbz x15, #1, 55f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" + "tbz x14, #2, 56f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "tbz x14, #1, 55f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" "b 58f\n" "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 58f\n" - "st1 { v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" "b 58f\n" "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 57f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" + "tbz x14, #1, 57f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" "b 58f\n" "57:" // Height 2: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b16, [x13, #0x0]\n" + "str b20, [x23, #0x0]\n" "58:" // Height 2: Partial direct writeback: Done "b 60f\n" "59:" // Height 2: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" "60:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 32b\n" "b 122f\n" "61:" // Height 3 - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" "62:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" @@ -851,317 +850,317 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" "63:" // Height 3: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 65f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "cbnz x12, 66f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "cbnz x11, 66f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" + "add x9, 
x9, x20\n" + "add x28, x28, x20\n" + "add x27, x27, x20\n" "b 66f\n" "65:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" - "add x22, x23, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" + "add x27, x28, x21\n" "66:" // Height 3: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 71f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x27, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 69f\n" "67:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x20, [x12, #0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x23, [x12, #0x88]\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d29, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v29.d[1], x20\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x22, [x12, #0x98]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d28, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr x21, [x12, #0xa8]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr x20, [x12, #0xb8]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d5, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "mov v5.d[1], x28\n" + "mov v28.d[1], x23\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "mov v6.d[1], x27\n" + "mov v5.d[1], x22\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" + "ldr d4, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "mov v7.d[1], x26\n" + "mov v4.d[1], x21\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr x24, [x13, #0xc8]\n" + "ldr x23, [x12, #0xc8]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" + "ldr d3, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "mov v8.d[1], x25\n" + "mov v3.d[1], x20\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr x20, [x13, #0xd8]\n" + "ldr x22, [x12, #0xd8]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d31, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr x9, [x13, #0xe8]\n" + "ldr x21, [x12, #0xe8]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr x28, [x13, #0xf8]\n" + "ldr x20, [x12, #0xf8]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "mov v9.d[1], x24\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "mov v10.d[1], x20\n" - ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x9\n" - ".inst 0x4f81e8b4 // 
sdot v20.4s, v5.16b, v1.4b[2]\n" - "add x10, x10, #0x10\n" - ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" - "add x22, x22, #0x10\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "ldr d30, [x12, #0xd0]\n" + ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" + "mov v31.d[1], x23\n" + ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" + "mov v30.d[1], x22\n" + ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" + "ldr d29, [x12, #0xe0]\n" + ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" + "add x9, x9, #0x10\n" + ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" + "ldr d28, [x12, #0xf0]\n" + ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" + "add x27, x27, #0x10\n" + ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 68f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "68:" // Height 3: Multiply loop: unique 9: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, 
#0x0]\n" - "cmp x11, #0x20\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x27, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" "bge 67b\n" "69:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q29, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q28, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q5, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q4, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q3, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q31, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, 
v1.4b[3]\n" - ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "ldr q30, [x12, #0xd0]\n" + ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x12, #0xe0]\n" + ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" + "ldr q28, [x12, #0xf0]\n" + ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 70f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "70:" // Height 3: Multiply loop: unique 10: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" "71:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 78f\n" - "cmp x11, #0x4\n" + "cbz x10, 78f\n" + "cmp x10, #0x4\n" "blt 74f\n" "72:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" - "ldr s2, [x22], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x27], #0x4\n" "tbnz %x[flags], #31, 73f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d8 // sdot 
v24.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q31, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q30, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q29, [x12, #0x20]\n" + ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" + "ldr q28, [x12, #0x30]\n" + ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" + ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" "bge 72b\n" "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x11, 78f\n" - "tbz x11, #1, 75f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "ldr h2, [x22], #0x2\n" - "tbz x11, #0, 76f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" - "ld1 { v2.b }[2], [x22]\n" + "cbz x10, 78f\n" + "tbz x10, #1, 75f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "tbz x10, #0, 76f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x27]\n" "b 76f\n" "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" - "ldr b2, [x22, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x27, #0x0]\n" "76:" // Height 3: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 77f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + "ldr q28, [x12, #0x0]\n" + ".inst 0x4f80e390 // sdot v16.4s, v28.16b, v0.4b[0]\n" + "ldr q30, [x12, #0x10]\n" + ".inst 0x4f81e394 // sdot v20.4s, v28.16b, v1.4b[0]\n" + "ldr q29, [x12, #0x20]\n" + ".inst 0x4f82e398 // sdot v24.4s, v28.16b, v2.4b[0]\n" + "ldr 
q28, [x12, #0x30]\n" + ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" "78:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" - "add x21, x22, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" + "add x23, x13, x20\n" + "add x22, x23, x20\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbnz %x[flags], #31, 79f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" - "neg v3.4s, v3.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v28.4s }, [x20]\n" + "neg v28.4s, v28.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v28.4s\n" + "mul v12.4s, v12.4s, v28.4s\n" + "mul v13.4s, v13.4s, v28.4s\n" "79:" // Height 3: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q31, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q30, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q29, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q28, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1171,73 +1170,73 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v31.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v31.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v31.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v28.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, 
v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v28.4s\n" + "sqrdmulh v17.4s, v17.4s, v28.4s\n" + "sqrdmulh v18.4s, v18.4s, v28.4s\n" + "sqrdmulh v19.4s, v19.4s, v28.4s\n" + "sqrdmulh v20.4s, v20.4s, v28.4s\n" + "sqrdmulh v21.4s, v21.4s, v28.4s\n" + "sqrdmulh v22.4s, v22.4s, v28.4s\n" + "sqrdmulh v23.4s, v23.4s, v28.4s\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 80f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v16.16b, v0.16b\n" + "and v31.16b, v17.16b, v0.16b\n" + "and v30.16b, v18.16b, v0.16b\n" + "and v29.16b, v19.16b, v0.16b\n" + "and v28.16b, v20.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v30.4s\n" + "sqadd v19.4s, v19.4s, v29.4s\n" + "sqadd v20.4s, v20.4s, v28.4s\n" + "and v3.16b, v21.16b, v0.16b\n" + "and v2.16b, v22.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v0.16b\n" + "and v29.16b, v26.16b, v0.16b\n" + "and v28.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v3.4s\n" + "sqadd v22.4s, v22.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "80:" // Height 3: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -1251,156 +1250,156 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add 
v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v28.4s }, [x20]\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "add v18.4s, v18.4s, v28.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v28.4s\n" + "add v21.4s, v21.4s, v28.4s\n" + "add v22.4s, v22.4s, v28.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v28.4s\n" + "add v25.4s, v25.4s, v28.4s\n" + "add v26.4s, v26.4s, v28.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v23.4s, v23.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 89f\n" - "tbz x15, #3, 84f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "tbz x15, #2, 82f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "tbz x15, #1, 81f\n" - "st1 { v16.h }[6], [x14], 
#0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "tbz x14, #3, 84f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x14, #2, 82f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "tbz x14, #1, 81f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 88f\n" "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 88f\n" "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 83f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "tbz x14, #1, 83f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 88f\n" "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 88f\n" "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 86f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "tbz x15, #1, 85f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "tbz x14, #2, 86f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "tbz x14, #1, 85f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 88f\n" "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 88f\n" "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 87f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "tbz x14, #1, 87f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 88f\n" "87:" // Height 3: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b16, [x13, 
#0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "88:" // Height 3: Partial direct writeback: Done "b 90f\n" "89:" // Height 3: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "90:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 62b\n" "b 122f\n" "91:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "mov x20, #0x4\n" - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" - "mov x14, %x[output_ptr]\n" + "mov x13, %x[output_ptr]\n" "madd %x[output_ptr], x21, x20, %x[output_ptr]\n" "92:" // Height 4: Column loop "movi v16.4s, #0x0\n" @@ -1420,117 +1419,117 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" "93:" // Height 4: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 95f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" - "cbnz x12, 96f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x26, [x20, #0x18]\n" + "cbnz x11, 96f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" - "add x21, x21, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" + "add x27, x27, x20\n" + "add x26, x26, x20\n" "b 96f\n" "95:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" + "add x27, x28, x21\n" + "add x26, x27, x21\n" "96:" // Height 4: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 101f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q3, [x21, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x26, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 99f\n" "97:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x22, [x12, #0x78]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x21, [x12, #0x88]\n" ".inst 0x4f82e098 // sdot v24.4s, 
v4.16b, v2.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x20, [x12, #0x98]\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d4, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v4.d[1], x22\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr x25, [x12, #0xa8]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr x24, [x12, #0xb8]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d5, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "mov v5.d[1], x28\n" + "mov v5.d[1], x21\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x24, [x13, #0xc8]\n" + "ldr x23, [x12, #0xc8]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr x20, [x13, #0xd8]\n" + "ldr x22, [x12, #0xd8]\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d6, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x27\n" + "mov v6.d[1], x20\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x9, [x13, #0xe8]\n" + "ldr x21, [x12, #0xe8]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr x28, [x13, #0xf8]\n" + "ldr x20, [x12, #0xf8]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" + "ldr d7, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "mov v7.d[1], x26\n" + "mov v7.d[1], x25\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" + "ldr d8, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "mov v8.d[1], x25\n" + "mov v8.d[1], x24\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "add x21, x21, #0x10\n" + "add x26, x26, #0x10\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d9, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "mov v9.d[1], x24\n" + "mov v9.d[1], x23\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" + "ldr d10, [x12, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "mov v10.d[1], x20\n" + "mov v10.d[1], x22\n" ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" + "ldr d4, [x12, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x9\n" + "mov v4.d[1], x21\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" + "ldr d5, [x12, #0xf0]\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" + "mov v5.d[1], x20\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - "add x13, x13, #0x100\n" + "add x12, x12, #0x100\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8dd // sdot v29.4s, 
v6.16b, v3.4b[2]\n" ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" @@ -1563,77 +1562,77 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "98:" // Height 4: Multiply loop: unique 13: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q2, [x22, #0x0]\n" - "ldr q3, [x21, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x26, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "bge 97b\n" "99:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q4, [x12, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "add x21, x21, #0x10\n" + "add x26, x26, #0x10\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q5, [x12, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q6, [x12, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q7, [x12, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q8, [x12, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q9, [x12, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" + "ldr q10, [x12, #0xd0]\n" ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x4fa1e097 // 
sdot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" + "ldr q4, [x12, #0xe0]\n" ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" + "ldr q5, [x12, #0xf0]\n" ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" + "add x12, x12, #0x100\n" ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" @@ -1667,67 +1666,67 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "100:" // Height 4: Multiply loop: unique 14: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "101:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 108f\n" - "cmp x11, #0x4\n" + "cbz x10, 108f\n" + "cmp x10, #0x4\n" "blt 104f\n" "102:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" - "ldr s2, [x22], #0x4\n" - "ldr s3, [x21], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x26], #0x4\n" "tbnz %x[flags], #31, 103f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q6, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q5, [x12, #0x20]\n" + ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x12, #0x30]\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // 
sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" "bge 102b\n" "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x11, 108f\n" - "tbz x11, #1, 105f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "ldr h2, [x22], #0x2\n" - "ldr h3, [x21], #0x2\n" - "tbz x11, #0, 106f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" - "ld1 { v2.b }[2], [x22]\n" - "ld1 { v3.b }[2], [x21]\n" + "cbz x10, 108f\n" + "tbz x10, #1, 105f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h3, [x26], #0x2\n" + "tbz x10, #0, 106f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x27]\n" + "ld1 { v3.b }[2], [x26]\n" "b 106f\n" "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" - "ldr b2, [x22, #0x0]\n" - "ldr b3, [x21, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x27, #0x0]\n" + "ldr b3, [x26, #0x0]\n" "106:" // Height 4: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 107f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" @@ -1735,64 +1734,64 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x0]\n" + ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x12, #0x10]\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x12, #0x20]\n" + ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" + "ldr q4, [x12, #0x30]\n" + ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + 
".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" "108:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 94b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" + "add x23, x13, x20\n" + "add x22, x23, x20\n" "add x21, x22, x20\n" - "add x20, x21, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" "prfm pstl1keep, [x21, #0x0]\n" - "prfm pstl1keep, [x20, #0x0]\n" "tbnz %x[flags], #31, 109f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "neg v4.4s, v4.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v0.4s }, [x20]\n" + "neg v0.4s, v0.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "109:" // Height 4: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q3, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q2, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q1, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q0, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1806,93 +1805,93 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "add v29.4s, v29.4s, v14.4s\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v0.4s\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v2.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v1.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v2.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v1.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, 
v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + "sqrdmulh v31.4s, v31.4s, v1.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 110f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v16.16b, v0.16b\n" + "and v1.16b, v17.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v2.4s\n" + "sqadd v17.4s, v17.4s, v1.4s\n" + "and v7.16b, v18.16b, v0.16b\n" + "and v6.16b, v19.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v4.16b, v21.16b, v0.16b\n" + "and v3.16b, v22.16b, v0.16b\n" + "and v2.16b, v23.16b, v0.16b\n" + "and v1.16b, v24.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" - "and v10.16b, v29.16b, v0.16b\n" - "and v4.16b, v30.16b, v0.16b\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v7.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "sqadd v22.4s, v22.4s, v3.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "sqadd v24.4s, v24.4s, v1.4s\n" + "and v7.16b, v25.16b, v0.16b\n" + "and v6.16b, v26.16b, v0.16b\n" + "and v5.16b, v27.16b, v0.16b\n" + "and v4.16b, v28.16b, v0.16b\n" + "and v3.16b, v29.16b, v0.16b\n" + "and v2.16b, v30.16b, v0.16b\n" + "and v1.16b, v31.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, 
v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sqadd v28.4s, v28.4s, v9.4s\n" - "sqadd v29.4s, v29.4s, v10.4s\n" - "sqadd v30.4s, v30.4s, v4.4s\n" - "sqadd v31.4s, v31.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "sqadd v29.4s, v29.4s, v3.4s\n" + "sqadd v30.4s, v30.4s, v2.4s\n" + "sqadd v31.4s, v31.4s, v1.4s\n" "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -1910,172 +1909,172 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v0.4s\n" + "add v18.4s, v18.4s, v0.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v0.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v0.4s\n" + "add v30.4s, v30.4s, v0.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v0.4s }, [x20]\n" + "smin v16.4s, v16.4s, v0.4s\n" + "smin v17.4s, v17.4s, v0.4s\n" + "smin v18.4s, v18.4s, v0.4s\n" + "smin v19.4s, v19.4s, v0.4s\n" + "smin v20.4s, v20.4s, v0.4s\n" + "smin v21.4s, v21.4s, v0.4s\n" + "smin v22.4s, v22.4s, v0.4s\n" + "smin v23.4s, v23.4s, v0.4s\n" + "smin v24.4s, v24.4s, v0.4s\n" + "smin v25.4s, v25.4s, v0.4s\n" + "smin v26.4s, v26.4s, 
v0.4s\n" + "smin v27.4s, v27.4s, v0.4s\n" + "smin v28.4s, v28.4s, v0.4s\n" + "smin v29.4s, v29.4s, v0.4s\n" + "smin v30.4s, v30.4s, v0.4s\n" + "smin v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v0.4s }, [x20]\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smax v21.4s, v21.4s, v0.4s\n" + "smax v22.4s, v22.4s, v0.4s\n" + "smax v23.4s, v23.4s, v0.4s\n" + "smax v24.4s, v24.4s, v0.4s\n" + "smax v25.4s, v25.4s, v0.4s\n" + "smax v26.4s, v26.4s, v0.4s\n" + "smax v27.4s, v27.4s, v0.4s\n" + "smax v28.4s, v28.4s, v0.4s\n" + "smax v29.4s, v29.4s, v0.4s\n" + "smax v30.4s, v30.4s, v0.4s\n" + "smax v31.4s, v31.4s, v0.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, v28.16b, v17.16b\n" "bge 119f\n" - "tbz x15, #3, 114f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" - "tbz x15, #2, 112f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" - "tbz x15, #1, 111f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "tbz x14, #3, 114f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x14, #2, 112f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x14, #1, 111f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 118f\n" "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 118f\n" "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 113f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "tbz x14, #1, 113f\n" + "st1 { v16.h }[4], [x13], #0x2\n" 
+ "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 118f\n" "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 118f\n" "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 116f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" - "tbz x15, #1, 115f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "tbz x14, #2, 116f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x14, #1, 115f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + "st1 { v28.b }[6], [x21]\n" "b 118f\n" "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 118f\n" "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 117f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "tbz x14, #1, 117f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 118f\n" "117:" // Height 4: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b16, [x13, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "118:" // Height 4: Partial direct writeback: Done "b 120f\n" "119:" // Height 4: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "120:" // Height 4: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 92b\n" "subs %x[M], %x[M], #0x4\n" "beq 122f\n" @@ -2089,10 +2088,9 @@ void a64_hybrid_s8qa_dot_4x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "122:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), 
[output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp index 485a47dc67..3b773a6827 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp @@ -78,7 +78,6 @@ void a64_hybrid_s8qa_dot_4x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 91f\n" @@ -102,11 +101,11 @@ void a64_hybrid_s8qa_dot_4x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -128,32 +127,32 @@ void a64_hybrid_s8qa_dot_4x16 ( "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q21, [x28, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q20, [x28, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q26, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q25, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q24, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q23, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - 
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q22, [x28, #0xd0]\n" + ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x28, #0xe0]\n" + ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x28, #0xf0]\n" + ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" "add x24, x24, #0x10\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum @@ -171,33 +170,33 @@ void a64_hybrid_s8qa_dot_4x16 ( "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q21, [x28, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q20, [x28, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q26, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q25, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q24, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q23, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q22, [x28, #0xd0]\n" + ".inst 0x4fa0e2b3 // sdot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x28, #0xe0]\n" + ".inst 0x4f80ea90 // sdot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x28, #0xf0]\n" + ".inst 0x4f80eb51 // sdot v17.4s, v26.16b, v0.4b[2]\n" "sub x25, x25, #0x10\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f80eb32 // sdot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f80eb13 // sdot v19.4s, v24.16b, v0.4b[2]\n" "add x24, x24, #0x10\n" "add x28, x28, #0x100\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa0eaf0 // sdot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x4fa0ead1 // sdot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x4fa0eab2 // sdot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa0ea93 // sdot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: 
skip row sum @@ -211,16 +210,16 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q23, [x28, #0x0]\n" + "ldr q22, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x20]\n" + "ldr q20, [x28, #0x30]\n" + ".inst 0x4f80e2f0 // sdot v16.4s, v23.16b, v0.4b[0]\n" + ".inst 0x4f80e2d1 // sdot v17.4s, v22.16b, v0.4b[0]\n" + ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks @@ -236,14 +235,14 @@ void a64_hybrid_s8qa_dot_4x16 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x0]\n" + "ldr q20, [x28, #0x10]\n" + ".inst 0x4f80e2b0 // sdot v16.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f80e291 // sdot v17.4s, v20.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x20]\n" + "ldr q20, [x28, #0x30]\n" + ".inst 0x4f80e2b2 // sdot v18.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f80e293 // sdot v19.4s, v20.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -252,72 +251,72 @@ void a64_hybrid_s8qa_dot_4x16 ( "bne 4b\n" "prfm pstl1keep, [x27, #0x0]\n" "tbnz %x[flags], #31, 19f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v20.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v1.4s, v1.4s\n" + "neg v20.4s, v20.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v20.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q24, [x10, #0x0]\n" + "ldr q23, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q22, [x10, #0x20]\n" + "ldr q21, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v20.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "add v16.4s, v16.4s, v24.4s\n" + "add v17.4s, v17.4s, v23.4s\n" + "add v18.4s, v18.4s, v22.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v19.4s, v19.4s, v21.4s\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" "add x10, x10, #0x40\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, 
v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" "tbz %x[flags], #5, 20f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v0.16b\n" + "and v21.16b, v18.16b, v0.16b\n" + "and v20.16b, v19.16b, v0.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "20:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v22.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v21.4s }, [x20]\n" + "add v16.4s, v16.4s, v22.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" + "add v18.4s, v18.4s, v22.4s\n" + "add v19.4s, v19.4s, v22.4s\n" "cmp x9, #0x10\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "smin v16.4s, v16.4s, v21.4s\n" + "smin v17.4s, v17.4s, v21.4s\n" + "smin v18.4s, v18.4s, v21.4s\n" + "smin v19.4s, v19.4s, v21.4s\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" @@ -397,12 +396,12 @@ void a64_hybrid_s8qa_dot_4x16 ( "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 35f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -410,7 +409,7 @@ void a64_hybrid_s8qa_dot_4x16 ( "b 36f\n" "35:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "36:" // Height 2: input setup done "cmp x25, #0x10\n" "blt 41f\n" @@ -428,48 +427,48 @@ void a64_hybrid_s8qa_dot_4x16 ( "37:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // 
sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "add x23, x23, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x28, #0xf0]\n" "add x28, x28, #0x100\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 38f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" @@ -491,49 +490,49 @@ void a64_hybrid_s8qa_dot_4x16 ( "39:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e090 // 
sdot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" "sub x25, x25, #0x10\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" "add x23, x23, #0x10\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4fa0e333 // sdot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e337 // sdot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4f80eb10 // sdot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb14 // sdot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x28, #0xf0]\n" "add x28, x28, #0x100\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x4f80ebb2 // sdot v18.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebb6 // sdot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f80eb93 // sdot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb97 // sdot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4fa0eb70 // sdot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x4fa1eb74 // sdot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x4fa0eb51 // sdot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x4fa1eb55 // sdot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb36 // sdot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa0eb13 // sdot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb17 // sdot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" @@ -551,21 +550,21 @@ void a64_hybrid_s8qa_dot_4x16 ( 
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q27, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x4f80e370 // sdot v16.4s, v27.16b, v0.4b[0]\n" + ".inst 0x4f81e374 // sdot v20.4s, v27.16b, v1.4b[0]\n" + ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" + ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" "bge 42b\n" "44:" // Height 2: Multiply loop: Skip odd blocks "cbz x25, 48f\n" @@ -584,209 +583,209 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x4f80e310 // sdot v16.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e314 // sdot v20.4s, v24.16b, v1.4b[0]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x4f80e351 // sdot v17.4s, v26.16b, v0.4b[0]\n" + ".inst 0x4f81e355 // sdot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x4f80e332 // sdot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e336 // sdot v22.4s, v25.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e313 // sdot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e317 // sdot v23.4s, v24.16b, v1.4b[0]\n" "48:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 34b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" + "add x23, x27, x20\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "tbnz %x[flags], #31, 49f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "neg v2.4s, v2.4s\n" + "neg v24.4s, v24.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + 
"mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "49:" // Height 2: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q27, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q26, [x10, #0x20]\n" + "ldr q25, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v27.4s\n" "add x10, x10, #0x40\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v25.4s\n" + "add v20.4s, v20.4s, v28.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v27.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v23.4s, v23.4s, v25.4s\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" "tbz %x[flags], #5, 50f\n" - "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" + "and v24.16b, v16.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v24.4s\n" + "and v30.16b, v17.16b, v0.16b\n" + "and v29.16b, v18.16b, v0.16b\n" + "and v28.16b, v19.16b, v0.16b\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v0.16b\n" + "and v25.16b, v22.16b, v0.16b\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + 
"sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "50:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 59f\n" "tbz x9, #3, 54f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" + "str d20, [x23], #0x8\n" "tbz x9, #2, 52f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" "tbz x9, #1, 51f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" + "st1 { v20.b }[14], [x23]\n" "b 58f\n" "51:" // Height 2: Partial direct writeback: partial_1_12 "tbz x9, #0, 58f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" + "st1 { v20.b }[12], [x23]\n" "b 58f\n" "52:" // Height 2: Partial direct writeback: partial_2_8 "tbz x9, #1, 53f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h 
}[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" + "st1 { v20.b }[10], [x23]\n" "b 58f\n" "53:" // Height 2: Partial direct writeback: partial_1_8 "tbz x9, #0, 58f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" + "st1 { v20.b }[8], [x23]\n" "b 58f\n" "54:" // Height 2: Partial direct writeback: partial_4_0 "tbz x9, #2, 56f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" + "str s20, [x23], #0x4\n" "tbz x9, #1, 55f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" + "st1 { v20.b }[6], [x23]\n" "b 58f\n" "55:" // Height 2: Partial direct writeback: partial_1_4 "tbz x9, #0, 58f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" + "st1 { v20.b }[4], [x23]\n" "b 58f\n" "56:" // Height 2: Partial direct writeback: partial_2_0 "tbz x9, #1, 57f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" + "str h20, [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" + "st1 { v20.b }[2], [x23]\n" "b 58f\n" "57:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b20, [x23, #0x0]\n" "58:" // Height 2: Partial direct writeback: Done "b 60f\n" "59:" // Height 2: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" + "str q20, [x23, #0x0]\n" "60:" // Height 2: Writeback done "subs x9, x9, #0x10\n" "bgt 32b\n" @@ -819,13 +818,13 @@ void a64_hybrid_s8qa_dot_4x16 ( "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 65f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 66f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -834,8 +833,8 @@ void a64_hybrid_s8qa_dot_4x16 ( "b 66f\n" "65:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "66:" // Height 3: input setup done "cmp x25, #0x10\n" "blt 71f\n" @@ -857,62 +856,62 @@ void a64_hybrid_s8qa_dot_4x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q29, [x28, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q28, [x28, #0x80]\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q5, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q4, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, 
v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q3, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q31, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q30, [x28, #0xd0]\n" + ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x28, #0xe0]\n" + ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" + "ldr q28, [x28, #0xf0]\n" + ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, 
v1.4b[3]\n" + ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 68f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" @@ -940,63 +939,63 @@ void a64_hybrid_s8qa_dot_4x16 ( "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q29, [x28, #0x70]\n" ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q28, [x28, #0x80]\n" "add x22, x22, #0x10\n" ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q5, [x28, #0x90]\n" ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q4, [x28, #0xa0]\n" ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q3, [x28, #0xb0]\n" ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q31, [x28, #0xc0]\n" ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q30, [x28, #0xd0]\n" + ".inst 0x4fa0e3b3 // sdot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3b7 // sdot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3bb // sdot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x28, #0xe0]\n" + ".inst 0x4f80eb90 // sdot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb94 // sdot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb98 // sdot v24.4s, v28.16b, v2.4b[2]\n" + "ldr q28, [x28, #0xf0]\n" + ".inst 0x4f80e8b1 // sdot v17.4s, v5.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x4fa1e955 
// sdot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4f81e8b5 // sdot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b9 // sdot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x4f82e89a // sdot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x4f80e873 // sdot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x4f81e877 // sdot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x4f82e87b // sdot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x4fa0ebf0 // sdot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x4fa1ebf4 // sdot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x4fa2ebf8 // sdot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x4fa0ebd1 // sdot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x4fa1ebd5 // sdot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x4fa2ebd9 // sdot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x4fa0ebb2 // sdot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebb6 // sdot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebba // sdot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa0eb93 // sdot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb97 // sdot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb9b // sdot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 70f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" @@ -1018,25 +1017,25 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q31, [x28, #0x0]\n" + "ldr q30, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q29, [x28, #0x20]\n" + "ldr q28, [x28, #0x30]\n" + ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" + ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" + ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" + ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e39b // sdot v27.4s, v28.16b, 
v2.4b[0]\n" "bge 72b\n" "74:" // Height 3: Multiply loop: Skip odd blocks "cbz x25, 78f\n" @@ -1059,144 +1058,144 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" + "ldr q31, [x28, #0x0]\n" + "ldr q30, [x28, #0x10]\n" + ".inst 0x4f80e3f0 // sdot v16.4s, v31.16b, v0.4b[0]\n" + ".inst 0x4f81e3f4 // sdot v20.4s, v31.16b, v1.4b[0]\n" + "ldr q29, [x28, #0x20]\n" + "ldr q28, [x28, #0x30]\n" + ".inst 0x4f82e3f8 // sdot v24.4s, v31.16b, v2.4b[0]\n" + ".inst 0x4f80e3d1 // sdot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 0x4f81e3d5 // sdot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x4f82e3d9 // sdot v25.4s, v30.16b, v2.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3b6 // sdot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3ba // sdot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e397 // sdot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e39b // sdot v27.4s, v28.16b, v2.4b[0]\n" "78:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbnz %x[flags], #31, 79f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v28.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" + "neg v28.4s, v28.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v28.4s\n" + "mul v12.4s, v12.4s, v28.4s\n" + "mul v13.4s, v13.4s, v28.4s\n" "79:" // Height 3: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q31, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q30, [x10, #0x20]\n" + "ldr q29, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v28.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, 
v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v31.4s\n" + "add v18.4s, v18.4s, v30.4s\n" + "add v19.4s, v19.4s, v29.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add v21.4s, v21.4s, v31.4s\n" + "add v22.4s, v22.4s, v30.4s\n" + "add v23.4s, v23.4s, v29.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v31.4s\n" + "add v26.4s, v26.4s, v30.4s\n" + "add v27.4s, v27.4s, v29.4s\n" + "sqrdmulh v16.4s, v16.4s, v28.4s\n" + "sqrdmulh v17.4s, v17.4s, v28.4s\n" + "sqrdmulh v18.4s, v18.4s, v28.4s\n" + "sqrdmulh v19.4s, v19.4s, v28.4s\n" + "sqrdmulh v20.4s, v20.4s, v28.4s\n" + "sqrdmulh v21.4s, v21.4s, v28.4s\n" + "sqrdmulh v22.4s, v22.4s, v28.4s\n" + "sqrdmulh v23.4s, v23.4s, v28.4s\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" "tbz %x[flags], #5, 80f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v16.16b, v0.16b\n" + "and v31.16b, v17.16b, v0.16b\n" + "and v30.16b, v18.16b, v0.16b\n" + "and v29.16b, v19.16b, v0.16b\n" + "and v28.16b, v20.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + 
"sqadd v18.4s, v18.4s, v30.4s\n" + "sqadd v19.4s, v19.4s, v29.4s\n" + "sqadd v20.4s, v20.4s, v28.4s\n" + "and v3.16b, v21.16b, v0.16b\n" + "and v2.16b, v22.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v0.16b\n" + "and v29.16b, v26.16b, v0.16b\n" + "and v28.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v3.4s\n" + "sqadd v22.4s, v22.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "80:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v30.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1204,132 +1203,132 @@ void a64_hybrid_s8qa_dot_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v16.4s, v16.4s, v30.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v30.4s\n" + "add v19.4s, v19.4s, v30.4s\n" + "add v20.4s, v20.4s, v30.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v30.4s\n" + "add v23.4s, v23.4s, v30.4s\n" + "add v24.4s, v24.4s, v30.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v30.4s\n" + "add v27.4s, v27.4s, v30.4s\n" + "smin v16.4s, v16.4s, v29.4s\n" + "smin v17.4s, v17.4s, v29.4s\n" + "smin v18.4s, v18.4s, v29.4s\n" + "smin v19.4s, v19.4s, v29.4s\n" + "smin v20.4s, v20.4s, v29.4s\n" + "smin v21.4s, v21.4s, v29.4s\n" + "smin v22.4s, v22.4s, v29.4s\n" + "smin v23.4s, v23.4s, v29.4s\n" + "smin v24.4s, v24.4s, v29.4s\n" + "smin v25.4s, v25.4s, 
v29.4s\n" + "smin v26.4s, v26.4s, v29.4s\n" + "smin v27.4s, v27.4s, v29.4s\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 89f\n" "tbz x9, #3, 84f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x9, #2, 82f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x9, #1, 81f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 88f\n" "81:" // Height 3: Partial direct writeback: partial_1_12 "tbz x9, #0, 88f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 88f\n" "82:" // Height 3: Partial direct writeback: partial_2_8 "tbz x9, #1, 83f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 88f\n" "83:" // Height 3: Partial direct writeback: partial_1_8 "tbz x9, #0, 88f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 88f\n" "84:" // Height 3: Partial direct writeback: partial_4_0 "tbz x9, #2, 86f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x9, #1, 85f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 88f\n" "85:" // Height 3: Partial direct writeback: partial_1_4 "tbz x9, #0, 88f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 88f\n" "86:" // Height 3: Partial direct writeback: partial_2_0 "tbz x9, 
#1, 87f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 88f\n" "87:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "88:" // Height 3: Partial direct writeback: Done "b 90f\n" "89:" // Height 3: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "90:" // Height 3: Writeback done "subs x9, x9, #0x10\n" "bgt 62b\n" @@ -1370,14 +1369,14 @@ void a64_hybrid_s8qa_dot_4x16 ( "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 95f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 96f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1387,9 +1386,9 @@ void a64_hybrid_s8qa_dot_4x16 ( "b 96f\n" "95:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "96:" // Height 4: input setup done "cmp x25, #0x10\n" "blt 101f\n" @@ -1614,29 +1613,29 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13f // 
sdot v31.4s, v9.16b, v3.4b[0]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" "bge 102b\n" "104:" // Height 4: Multiply loop: Skip odd blocks "cbz x25, 108f\n" @@ -1663,73 +1662,73 @@ void a64_hybrid_s8qa_dot_4x16 ( ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x4f80e0f0 // sdot v16.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fc // sdot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" "108:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 94b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x20, x21, x20\n" + "add x21, x22, x20\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" "prfm pstl1keep, [x21, #0x0]\n" - "prfm pstl1keep, [x20, #0x0]\n" "tbnz %x[flags], #31, 109f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v0.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" 
"addp v14.4s, v14.4s, v14.4s\n" - "neg v4.4s, v4.4s\n" + "neg v0.4s, v0.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "109:" // Height 4: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x10, #0x20]\n" + "ldr q2, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" @@ -1740,100 +1739,100 @@ void a64_hybrid_s8qa_dot_4x16 ( "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v2.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v23.4s, v23.4s, v2.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v2.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v31.4s, v31.4s, v2.4s\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + "sqrdmulh v31.4s, v31.4s, v1.4s\n" "tbz 
%x[flags], #5, 110f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v16.16b, v0.16b\n" + "and v1.16b, v17.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v2.4s\n" + "sqadd v17.4s, v17.4s, v1.4s\n" + "and v7.16b, v18.16b, v0.16b\n" + "and v6.16b, v19.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v4.16b, v21.16b, v0.16b\n" + "and v3.16b, v22.16b, v0.16b\n" + "and v2.16b, v23.16b, v0.16b\n" + "and v1.16b, v24.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" - "and v10.16b, v29.16b, v0.16b\n" - "and v4.16b, v30.16b, v0.16b\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v7.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "sqadd v22.4s, v22.4s, v3.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "sqadd v24.4s, v24.4s, v1.4s\n" + "and v7.16b, v25.16b, v0.16b\n" + "and v6.16b, v26.16b, v0.16b\n" + "and v5.16b, v27.16b, v0.16b\n" + "and v4.16b, v28.16b, v0.16b\n" + "and v3.16b, v29.16b, v0.16b\n" + "and v2.16b, v30.16b, v0.16b\n" + "and v1.16b, v31.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sqadd v28.4s, v28.4s, v9.4s\n" - "sqadd v29.4s, v29.4s, v10.4s\n" - "sqadd v30.4s, v30.4s, v4.4s\n" - "sqadd v31.4s, v31.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "sqadd v29.4s, v29.4s, v3.4s\n" + "sqadd v30.4s, v30.4s, v2.4s\n" + "sqadd v31.4s, v31.4s, v1.4s\n" "110:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v3.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v2.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add 
x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v1.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1845,163 +1844,163 @@ void a64_hybrid_s8qa_dot_4x16 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + "smin v29.4s, v29.4s, v2.4s\n" + "smin v30.4s, v30.4s, v2.4s\n" + "smin v31.4s, v31.4s, v2.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + "smax v29.4s, v29.4s, v1.4s\n" + "smax v30.4s, v30.4s, v1.4s\n" + "smax v31.4s, v31.4s, v1.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + 
"uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, v28.16b, v17.16b\n" "bge 119f\n" "tbz x9, #3, 114f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" "tbz x9, #2, 112f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" "tbz x9, #1, 111f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 118f\n" "111:" // Height 4: Partial direct writeback: partial_1_12 "tbz x9, #0, 118f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 118f\n" "112:" // Height 4: Partial direct writeback: partial_2_8 "tbz x9, #1, 113f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 118f\n" "113:" // Height 4: Partial direct writeback: partial_1_8 "tbz x9, #0, 118f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 118f\n" "114:" // Height 4: Partial direct writeback: partial_4_0 "tbz x9, #2, 116f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" "tbz x9, #1, 115f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], [x20], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + 
"st1 { v28.b }[6], [x21]\n" "b 118f\n" "115:" // Height 4: Partial direct writeback: partial_1_4 "tbz x9, #0, 118f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 118f\n" "116:" // Height 4: Partial direct writeback: partial_2_0 "tbz x9, #1, 117f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 118f\n" "117:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "118:" // Height 4: Partial direct writeback: Done "b 120f\n" "119:" // Height 4: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "120:" // Height 4: Writeback done "subs x9, x9, #0x10\n" "bgt 92b\n" @@ -2017,7 +2016,6 @@ void a64_hybrid_s8qa_dot_4x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "122:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp index 69ea87bc9e..55ea68d1b5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -98,5 +98,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp index 69d01a265e..883bd5afdd 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp @@ -78,7 +78,6 @@ void a64_hybrid_s8qa_mmla_4x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 97f\n" @@ -106,11 +105,11 @@ void a64_hybrid_s8qa_mmla_4x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -131,35 +130,35 @@ void a64_hybrid_s8qa_mmla_4x16 ( "ldr q4, [x28, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v27.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q25, [x28, #0x70]\n" + "trn2 v1.2d, v1.2d, v27.2d\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 
0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" + ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" "add x28, x28, #0x100\n" - ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" - ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" - ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" - ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" + ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" + ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" + ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" + ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" @@ -177,36 +176,36 @@ void a64_hybrid_s8qa_mmla_4x16 ( "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v24.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q25, [x28, #0x70]\n" + "trn2 v1.2d, v1.2d, v24.2d\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" "sub x25, x25, #0x10\n" - ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" - ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" + ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" "add x24, x24, #0x10\n" "add x28, x28, #0x100\n" - ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" - ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" - ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" - ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" + ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" + ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" + ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" + ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // 
sdot v11.4s, v1.16b, v15.16b\n" @@ -217,29 +216,29 @@ void a64_hybrid_s8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x24], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d25, [x24], #0x8\n" + "trn1 v0.2d, v25.2d, v24.2d\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" "sub x25, x25, #0x8\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" "cmp x25, #0x8\n" - ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" - "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" - ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" - ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n" + "ldr q27, [x28, #0x40]\n" + "ldr q26, [x28, #0x50]\n" + ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" + ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" + ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks @@ -264,26 +263,26 @@ void a64_hybrid_s8qa_mmla_4x16 ( "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "18:" // Height 1: Multiply loop: Ragged operand read: Done - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v24.2d\n" "tbnz %x[flags], #31, 19f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "19:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" - ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" - ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" - ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" - ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" - ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + "ldr q25, [x28, #0x0]\n" + "ldr q24, [x28, #0x10]\n" + ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x40]\n" + "ldr q24, [x28, #0x50]\n" + ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x4e99a413 // smmla v19.4s, v0.16b, 
v25.16b\n" + ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "20:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -297,75 +296,75 @@ void a64_hybrid_s8qa_mmla_4x16 ( "uzp1 v19.2d, v19.2d, v23.2d\n" "mov v23.16b, v16.16b\n" "tbnz %x[flags], #31, 21f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v16.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v1.4s, v1.4s\n" + "neg v16.4s, v16.4s\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v16.4s\n" "21:" // Height 1: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q24, [x10, #0x0]\n" + "ldr q22, [x10, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q21, [x10, #0x20]\n" + "ldr q20, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v16.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v19.4s, v19.4s, v20.4s\n" + "sqrdmulh v23.4s, v23.4s, v16.4s\n" "add x10, x10, #0x40\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v16.4s\n" + "sqrdmulh v18.4s, v18.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v16.4s\n" "tbz %x[flags], #5, 22f\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v22.16b, v23.16b, v0.16b\n" + "and v21.16b, v17.16b, v0.16b\n" + "and v20.16b, v18.16b, v0.16b\n" + "and v16.16b, v19.16b, v0.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v22.4s\n" + "sqadd v17.4s, v17.4s, v21.4s\n" + "sqadd v18.4s, v18.4s, v20.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" "22:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v21.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v20.4s }, [x20]\n" + "add v23.4s, v23.4s, v21.4s\n" + "add v17.4s, v17.4s, v21.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, 
v21.4s\n" "cmp x9, #0x10\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "smin v23.4s, v23.4s, v20.4s\n" + "smin v17.4s, v17.4s, v20.4s\n" + "smin v18.4s, v18.4s, v20.4s\n" + "smin v19.4s, v19.4s, v20.4s\n" + "smax v23.4s, v23.4s, v16.4s\n" + "smax v17.4s, v17.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smax v19.4s, v19.4s, v16.4s\n" "uzp1 v23.8h, v23.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v17.16b\n" + "uzp1 v16.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v16.16b\n" "bge 31f\n" "tbz x9, #3, 26f\n" "str d23, [x27], #0x8\n" @@ -442,12 +441,12 @@ void a64_hybrid_s8qa_mmla_4x16 ( "36:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 37f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 38f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -455,7 +454,7 @@ void a64_hybrid_s8qa_mmla_4x16 ( "b 38f\n" "37:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "38:" // Height 2: input setup done "cmp x25, #0x10\n" "blt 43f\n" @@ -473,34 +472,34 @@ void a64_hybrid_s8qa_mmla_4x16 ( "39:" // Height 2: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" + ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" "add x23, x23, #0x10\n" - ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" + ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" "add x28, x28, #0x100\n" - ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" - ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" - ".inst 
0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" + ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" + ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" + ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" @@ -522,35 +521,35 @@ void a64_hybrid_s8qa_mmla_4x16 ( "41:" // Height 2: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x4e99a417 // smmla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x4e98a430 // smmla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" "sub x25, x25, #0x10\n" - ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" - ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n" + ".inst 0x4e9ea434 // smmla v20.4s, v1.16b, v30.16b\n" + ".inst 0x4e9da431 // smmla v17.4s, v1.16b, v29.16b\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n" - ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" + ".inst 0x4e9ca435 // smmla v21.4s, v1.16b, v28.16b\n" + ".inst 0x4e9ba432 // smmla v18.4s, v1.16b, v27.16b\n" "add x28, x28, #0x100\n" - ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" - ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa436 // smmla v22.4s, v1.16b, v26.16b\n" + ".inst 0x4e99a433 // smmla v19.4s, v1.16b, v25.16b\n" + ".inst 0x4e98a437 // smmla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 42f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n" @@ -562,30 +561,30 @@ void a64_hybrid_s8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 46f\n" "44:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "trn1 v0.2d, v25.2d, v24.2d\n" "tbnz %x[flags], #31, 45f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x4e98a410 // smmla v16.4s, v0.16b, v24.16b\n" "sub x25, x25, #0x8\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" "cmp x25, #0x8\n" - ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" - "ldr q5, 
[x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" - ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" - ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" - ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" + ".inst 0x4e9aa414 // smmla v20.4s, v0.16b, v26.16b\n" + "ldr q27, [x28, #0x40]\n" + "ldr q26, [x28, #0x50]\n" + ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x4e9ba412 // smmla v18.4s, v0.16b, v27.16b\n" + ".inst 0x4e9aa416 // smmla v22.4s, v0.16b, v26.16b\n" + ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "bge 44b\n" "46:" // Height 2: Multiply loop: Skip odd blocks @@ -621,22 +620,22 @@ void a64_hybrid_s8qa_mmla_4x16 ( "tbnz %x[flags], #31, 51f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" "51:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" - ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" - ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" - ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" - ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" - ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" + "ldr q25, [x28, #0x0]\n" + "ldr q24, [x28, #0x10]\n" + ".inst 0x4e99a410 // smmla v16.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a414 // smmla v20.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x4e99a411 // smmla v17.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a415 // smmla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x40]\n" + "ldr q24, [x28, #0x50]\n" + ".inst 0x4e99a412 // smmla v18.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a416 // smmla v22.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x4e99a413 // smmla v19.4s, v0.16b, v25.16b\n" + ".inst 0x4e98a417 // smmla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "52:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -644,127 +643,127 @@ void a64_hybrid_s8qa_mmla_4x16 ( "cmp x26, x20\n" "bne 36b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" + "uzp1 v24.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "mov v23.16b, v4.16b\n" + "mov v23.16b, v24.16b\n" "tbnz %x[flags], #31, 53f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v2.4s, v2.4s\n" + "neg v24.4s, v24.4s\n" "dup v12.4s, 
v11.s[3]\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + "mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "53:" // Height 2: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q27, [x10, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q26, [x10, #0x20]\n" + "ldr q25, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v20.4s, v20.4s, v27.4s\n" "add x10, x10, #0x40\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v27.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v25.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" "tbz %x[flags], #5, 54f\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v24.4s\n" + "and v30.16b, v20.16b, v0.16b\n" + "and v29.16b, v21.16b, v0.16b\n" + "and v28.16b, v22.16b, v0.16b\n" + "and v27.16b, v16.16b, v0.16b\n" + "and v26.16b, v17.16b, v0.16b\n" + "and v25.16b, v18.16b, v0.16b\n" + "and v24.16b, v19.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + 
"sqadd v20.4s, v20.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "sqadd v22.4s, v22.4s, v28.4s\n" + "sqadd v16.4s, v16.4s, v27.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "sqadd v18.4s, v18.4s, v25.4s\n" + "sqadd v19.4s, v19.4s, v24.4s\n" "54:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" "uzp1 v23.8h, v23.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" @@ -774,68 +773,68 @@ void a64_hybrid_s8qa_mmla_4x16 ( "bge 63f\n" "tbz x9, #3, 58f\n" "str d23, [x27], #0x8\n" - "str d16, [x22], #0x8\n" + "str d16, [x23], #0x8\n" "tbz x9, #2, 56f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" "tbz x9, #1, 55f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" + "st1 { v16.b }[14], [x23]\n" "b 62f\n" "55:" // Height 2: Partial direct writeback: partial_1_12 "tbz x9, #0, 62f\n" "st1 { v23.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" + "st1 { v16.b }[12], [x23]\n" "b 62f\n" "56:" // Height 2: Partial direct writeback: partial_2_8 "tbz x9, #1, 57f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b 
}[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" + "st1 { v16.b }[10], [x23]\n" "b 62f\n" "57:" // Height 2: Partial direct writeback: partial_1_8 "tbz x9, #0, 62f\n" "st1 { v23.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" + "st1 { v16.b }[8], [x23]\n" "b 62f\n" "58:" // Height 2: Partial direct writeback: partial_4_0 "tbz x9, #2, 60f\n" "str s23, [x27], #0x4\n" - "str s16, [x22], #0x4\n" + "str s16, [x23], #0x4\n" "tbz x9, #1, 59f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" + "st1 { v16.b }[6], [x23]\n" "b 62f\n" "59:" // Height 2: Partial direct writeback: partial_1_4 "tbz x9, #0, 62f\n" "st1 { v23.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" + "st1 { v16.b }[4], [x23]\n" "b 62f\n" "60:" // Height 2: Partial direct writeback: partial_2_0 "tbz x9, #1, 61f\n" "str h23, [x27], #0x2\n" - "str h16, [x22], #0x2\n" + "str h16, [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" + "st1 { v16.b }[2], [x23]\n" "b 62f\n" "61:" // Height 2: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" + "str b16, [x23, #0x0]\n" "62:" // Height 2: Partial direct writeback: Done "b 64f\n" "63:" // Height 2: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" + "str q16, [x23, #0x0]\n" "64:" // Height 2: Writeback done "subs x9, x9, #0x10\n" "bgt 34b\n" @@ -872,13 +871,13 @@ void a64_hybrid_s8qa_mmla_4x16 ( "68:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 69f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 70f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -887,8 +886,8 @@ void a64_hybrid_s8qa_mmla_4x16 ( "b 70f\n" "69:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "70:" // Height 3: input setup done "cmp x25, #0x10\n" "blt 75f\n" @@ -909,12 +908,12 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q14, [x28, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q5, [x28, #0x60]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q4, [x28, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" "ldr q7, [x28, #0x90]\n" @@ -930,15 +929,15 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n" ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" "ldr q10, [x28, #0xc0]\n" - ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - ".inst 
0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n" + ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n" "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n" + "ldr q4, [x28, #0xf0]\n" "add x28, x28, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" @@ -948,12 +947,12 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" - ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n" ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n" "tbnz %x[flags], #31, 72f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" @@ -981,12 +980,12 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q14, [x28, #0x70]\n" ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q5, [x28, #0x60]\n" ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q4, [x28, #0x80]\n" ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" "ldr q7, [x28, #0x90]\n" @@ -1003,15 +1002,15 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n" "ldr q10, [x28, #0xc0]\n" "add x22, x22, #0x10\n" - ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n" - ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n" + ".inst 0x4e85a413 // smmla v19.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45b // smmla v27.4s, v2.16b, v5.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x4e8ea417 // smmla v23.4s, v0.16b, v14.16b\n" + ".inst 0x4e8ea45f // smmla v31.4s, v2.16b, v14.16b\n" "ldr q5, [x28, #0xe0]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a478 // smmla v24.4s, v3.16b, v4.16b\n" + "ldr q4, [x28, #0xf0]\n" "add x28, x28, #0x100\n" ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n" ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n" @@ -1021,12 +1020,12 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n" ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n" ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n" - ".inst 
0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n" - ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n" + ".inst 0x4e86a436 // smmla v22.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n" ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" - ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n" + ".inst 0x4e84a437 // smmla v23.4s, v1.16b, v4.16b\n" + ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n" "tbnz %x[flags], #31, 74f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" @@ -1042,41 +1041,41 @@ void a64_hybrid_s8qa_mmla_4x16 ( "blt 78f\n" "76:" // Height 3: Multiply loop: Odd block loop "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x22], #0x8\n" - "trn1 v2.2d, v3.2d, v7.2d\n" + "ldr d0, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v0.2d\n" + "ldr d1, [x22], #0x8\n" + "trn1 v2.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 77f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" + ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" "sub x25, x25, #0x8\n" "cmp x25, #0x8\n" "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" - ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" - ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" "add x28, x28, #0x80\n" ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" - ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" + ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" + ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" "bge 76b\n" "78:" // Height 3: Multiply loop: Skip odd blocks "cbz x25, 84f\n" @@ -1115,52 +1114,52 @@ void a64_hybrid_s8qa_mmla_4x16 ( "ldr b3, [x22, #0x0]\n" "82:" // Height 3: Multiply loop: Ragged 
operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v9.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" "tbnz %x[flags], #31, 83f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "83:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" - "ldr q5, [x28, #0x20]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q3, [x28, #0x10]\n" + ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x28, #0x20]\n" "ldr q6, [x28, #0x30]\n" - ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" - ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" + ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n" + "ldr q5, [x28, #0x40]\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" "add x28, x28, #0x80\n" - ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" - ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" - ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" - ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" + ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" + ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" "84:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 68b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "uzp1 v0.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" - "prfm pstl1keep, [x21, #0x0]\n" + "prfm pstl1keep, [x22, #0x0]\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" @@ -1168,116 +1167,116 @@ void a64_hybrid_s8qa_mmla_4x16 ( "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v4.16b\n" + "mov v31.16b, v0.16b\n" "tbnz %x[flags], #31, 85f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v23.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, 
v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" + "neg v23.4s, v23.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v23.4s\n" + "mul v12.4s, v12.4s, v23.4s\n" + "mul v13.4s, v13.4s, v23.4s\n" "85:" // Height 3: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q30, [x10, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q29, [x10, #0x20]\n" + "ldr q28, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v23.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" + "add v20.4s, v20.4s, v30.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v28.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "sqrdmulh v31.4s, v31.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v21.4s, v21.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sqrdmulh v16.4s, v16.4s, v23.4s\n" + "sqrdmulh v17.4s, v17.4s, v23.4s\n" + "sqrdmulh v18.4s, v18.4s, v23.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqrdmulh v24.4s, v24.4s, v23.4s\n" + "sqrdmulh v25.4s, v25.4s, v23.4s\n" + "sqrdmulh v26.4s, v26.4s, v23.4s\n" + "sqrdmulh v27.4s, v27.4s, v23.4s\n" "tbz %x[flags], #5, 86f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, 
v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v31.16b, v0.16b\n" + "and v30.16b, v20.16b, v0.16b\n" + "and v29.16b, v21.16b, v0.16b\n" + "and v28.16b, v22.16b, v0.16b\n" + "and v23.16b, v16.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v1.4s\n" + "sqadd v20.4s, v20.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "sqadd v22.4s, v22.4s, v28.4s\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "and v3.16b, v17.16b, v0.16b\n" + "and v2.16b, v18.16b, v0.16b\n" + "and v1.16b, v19.16b, v0.16b\n" + "and v30.16b, v24.16b, v0.16b\n" + "and v29.16b, v25.16b, v0.16b\n" + "and v28.16b, v26.16b, v0.16b\n" + "and v23.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v18.4s, v18.4s, v2.4s\n" + "sqadd v19.4s, v19.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v30.4s\n" + "sqadd v25.4s, v25.4s, v29.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "sqadd v27.4s, v27.4s, v23.4s\n" "86:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v23.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1285,132 +1284,132 @@ void a64_hybrid_s8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax 
v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v29.4s\n" + "add v20.4s, v20.4s, v29.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v16.4s, v16.4s, v29.4s\n" + "add v17.4s, v17.4s, v29.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v29.4s\n" + "smin v31.4s, v31.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "smax v31.4s, v31.4s, v23.4s\n" + "smax v20.4s, v20.4s, v23.4s\n" + "smax v21.4s, v21.4s, v23.4s\n" + "smax v22.4s, v22.4s, v23.4s\n" + "smax v16.4s, v16.4s, v23.4s\n" + "smax v17.4s, v17.4s, v23.4s\n" + "smax v18.4s, v18.4s, v23.4s\n" + "smax v19.4s, v19.4s, v23.4s\n" + "smax v24.4s, v24.4s, v23.4s\n" + "smax v25.4s, v25.4s, v23.4s\n" + "smax v26.4s, v26.4s, v23.4s\n" + "smax v27.4s, v27.4s, v23.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 95f\n" "tbz x9, #3, 90f\n" "str d31, [x27], #0x8\n" - "str d16, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d16, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x9, #2, 88f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x9, #1, 87f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 94f\n" "87:" // Height 3: Partial direct writeback: partial_1_12 "tbz x9, #0, 94f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 94f\n" "88:" // Height 3: Partial direct writeback: partial_2_8 "tbz x9, #1, 89f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 94f\n" "89:" // Height 3: Partial direct writeback: partial_1_8 "tbz x9, #0, 94f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 94f\n" "90:" // 
Height 3: Partial direct writeback: partial_4_0 "tbz x9, #2, 92f\n" "str s31, [x27], #0x4\n" - "str s16, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s16, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x9, #1, 91f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 94f\n" "91:" // Height 3: Partial direct writeback: partial_1_4 "tbz x9, #0, 94f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 94f\n" "92:" // Height 3: Partial direct writeback: partial_2_0 "tbz x9, #1, 93f\n" "str h31, [x27], #0x2\n" - "str h16, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h16, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 94f\n" "93:" // Height 3: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "94:" // Height 3: Partial direct writeback: Done "b 96f\n" "95:" // Height 3: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "96:" // Height 3: Writeback done "subs x9, x9, #0x10\n" "bgt 66b\n" @@ -1451,14 +1450,14 @@ void a64_hybrid_s8qa_mmla_4x16 ( "100:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 101f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 102f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1468,9 +1467,9 @@ void a64_hybrid_s8qa_mmla_4x16 ( "b 102f\n" "101:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "102:" // Height 4: input setup done "cmp x25, #0x10\n" "blt 107f\n" @@ -1630,42 +1629,42 @@ void a64_hybrid_s8qa_mmla_4x16 ( "blt 110f\n" "108:" // Height 4: Multiply loop: Odd block loop "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v2.2d, v3.2d, v7.2d\n" + "ldr d0, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v0.2d\n" + "ldr d2, [x22], #0x8\n" + "ldr d1, [x21], #0x8\n" + "trn1 v2.2d, v2.2d, v1.2d\n" "tbnz %x[flags], #31, 109f\n" ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "109:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x4e88a410 // smmla 
v16.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" + ".inst 0x4e83a410 // smmla v16.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" "sub x25, x25, #0x8\n" "cmp x25, #0x8\n" "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n" - ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n" - ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x4e81a414 // smmla v20.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" + ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" "add x28, x28, #0x80\n" ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" - ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n" - ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n" + ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" + ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" + ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" "bge 108b\n" "110:" // Height 4: Multiply loop: Skip odd blocks "cbz x25, 116f\n" @@ -1716,51 +1715,51 @@ void a64_hybrid_s8qa_mmla_4x16 ( ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" "115:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n" - "ldr q5, [x28, #0x20]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q3, [x28, #0x10]\n" + ".inst 0x4e81a410 // smmla v16.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x28, #0x20]\n" "ldr q6, [x28, #0x30]\n" - ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n" - ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n" - ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" + ".inst 0x4e83a414 // smmla v20.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45c // smmla v28.4s, v2.16b, v3.16b\n" + "ldr q5, [x28, #0x40]\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n" ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a412 // smmla v18.4s, v0.16b, 
v7.16b\n" - ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n" + ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n" + ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n" "add x28, x28, #0x80\n" - ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n" - ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n" - ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n" - ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n" - ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n" - ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n" + ".inst 0x4e84a416 // smmla v22.4s, v0.16b, v4.16b\n" + ".inst 0x4e84a45e // smmla v30.4s, v2.16b, v4.16b\n" + ".inst 0x4e83a413 // smmla v19.4s, v0.16b, v3.16b\n" + ".inst 0x4e83a45b // smmla v27.4s, v2.16b, v3.16b\n" + ".inst 0x4e81a417 // smmla v23.4s, v0.16b, v1.16b\n" + ".inst 0x4e81a45f // smmla v31.4s, v2.16b, v1.16b\n" "116:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 100b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" + "uzp1 v0.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "add x21, x22, x20\n" - "add x20, x21, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x20, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" "uzp2 v19.2d, v19.2d, v23.2d\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" @@ -1770,38 +1769,38 @@ void a64_hybrid_s8qa_mmla_4x16 ( "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v4.16b\n" + "mov v31.16b, v0.16b\n" "tbnz %x[flags], #31, 117f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v0.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v4.4s, v4.4s\n" + "neg v0.4s, v0.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v14.4s, v13.s[3]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "117:" // Height 4: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x10, #0x20]\n" + "ldr q2, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v23.4s, v23.4s, v13.4s\n" "add v28.4s, v28.4s, v13.4s\n" "add x10, x10, #0x40\n" @@ -1812,100 +1811,100 @@ void a64_hybrid_s8qa_mmla_4x16 ( "add v26.4s, v26.4s, v14.4s\n" "add v27.4s, v27.4s, v14.4s\n" "add v31.4s, v31.4s, 
v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v2.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v2.4s\n" "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v1.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v3.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v2.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v2.4s\n" + "sqrdmulh v31.4s, v31.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" "tbz %x[flags], #5, 118f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "and v5.16b, v23.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v31.16b, v0.16b\n" + "and v1.16b, v20.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v2.4s\n" + "sqadd v20.4s, v20.4s, v1.4s\n" + "and v7.16b, v21.16b, v0.16b\n" + "and v6.16b, v22.16b, v0.16b\n" + "and v5.16b, v16.16b, v0.16b\n" + "and v4.16b, v17.16b, v0.16b\n" + "and v3.16b, v18.16b, v0.16b\n" + "and v2.16b, v19.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "sqadd v23.4s, v23.4s, v5.4s\n" - "and v6.16b, v28.16b, v0.16b\n" - "and v7.16b, v29.16b, v0.16b\n" - "and v8.16b, 
v30.16b, v0.16b\n" - "and v9.16b, v24.16b, v0.16b\n" - "and v10.16b, v25.16b, v0.16b\n" - "and v4.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v7.4s\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "sqadd v16.4s, v16.4s, v5.4s\n" + "sqadd v17.4s, v17.4s, v4.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v19.4s, v19.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "and v7.16b, v28.16b, v0.16b\n" + "and v6.16b, v29.16b, v0.16b\n" + "and v5.16b, v30.16b, v0.16b\n" + "and v4.16b, v24.16b, v0.16b\n" + "and v3.16b, v25.16b, v0.16b\n" + "and v2.16b, v26.16b, v0.16b\n" + "and v1.16b, v27.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v6.4s\n" - "sqadd v29.4s, v29.4s, v7.4s\n" - "sqadd v30.4s, v30.4s, v8.4s\n" - "sqadd v24.4s, v24.4s, v9.4s\n" - "sqadd v25.4s, v25.4s, v10.4s\n" - "sqadd v26.4s, v26.4s, v4.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v7.4s\n" + "sqadd v29.4s, v29.4s, v6.4s\n" + "sqadd v30.4s, v30.4s, v5.4s\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "sqadd v25.4s, v25.4s, v3.4s\n" + "sqadd v26.4s, v26.4s, v2.4s\n" + "sqadd v27.4s, v27.4s, v1.4s\n" "118:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v3.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v2.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v1.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1917,163 +1916,163 @@ void a64_hybrid_s8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax 
v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "smin v31.4s, v31.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + "smin v29.4s, v29.4s, v2.4s\n" + "smin v30.4s, v30.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smax v31.4s, v31.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + "smax v29.4s, v29.4s, v1.4s\n" + "smax v30.4s, v30.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v18.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v23.16b, v23.16b, v28.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v23.16b, v23.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 127f\n" "tbz x9, #3, 122f\n" "str d31, [x27], #0x8\n" - "str d16, [x22], #0x8\n" - "str d23, [x21], #0x8\n" - "str d24, [x20], #0x8\n" + "str d16, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d24, [x21], #0x8\n" "tbz x9, #2, 120f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v23.s }[2], [x21], #0x4\n" - "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "st1 { v24.s }[2], [x21], #0x4\n" "tbz x9, #1, 119f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v23.h }[6], [x21], #0x2\n" - "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v23.h }[6], [x22], #0x2\n" + "st1 { v24.h }[6], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v23.b }[14], [x21]\n" - 
"st1 { v24.b }[14], [x20]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v23.b }[14], [x22]\n" + "st1 { v24.b }[14], [x21]\n" "b 126f\n" "119:" // Height 4: Partial direct writeback: partial_1_12 "tbz x9, #0, 126f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v23.b }[12], [x21]\n" - "st1 { v24.b }[12], [x20]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v23.b }[12], [x22]\n" + "st1 { v24.b }[12], [x21]\n" "b 126f\n" "120:" // Height 4: Partial direct writeback: partial_2_8 "tbz x9, #1, 121f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v23.h }[4], [x21], #0x2\n" - "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v23.h }[4], [x22], #0x2\n" + "st1 { v24.h }[4], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v23.b }[10], [x21]\n" - "st1 { v24.b }[10], [x20]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v23.b }[10], [x22]\n" + "st1 { v24.b }[10], [x21]\n" "b 126f\n" "121:" // Height 4: Partial direct writeback: partial_1_8 "tbz x9, #0, 126f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v23.b }[8], [x21]\n" - "st1 { v24.b }[8], [x20]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v23.b }[8], [x22]\n" + "st1 { v24.b }[8], [x21]\n" "b 126f\n" "122:" // Height 4: Partial direct writeback: partial_4_0 "tbz x9, #2, 124f\n" "str s31, [x27], #0x4\n" - "str s16, [x22], #0x4\n" - "str s23, [x21], #0x4\n" - "str s24, [x20], #0x4\n" + "str s16, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "str s24, [x21], #0x4\n" "tbz x9, #1, 123f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v23.h }[2], [x21], #0x2\n" - "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v23.h }[2], [x22], #0x2\n" + "st1 { v24.h }[2], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v23.b }[6], [x21]\n" - "st1 { v24.b }[6], [x20]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v23.b }[6], [x22]\n" + "st1 { v24.b }[6], [x21]\n" "b 126f\n" "123:" // Height 4: Partial direct writeback: partial_1_4 "tbz x9, #0, 126f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v23.b }[4], [x21]\n" - "st1 { v24.b }[4], [x20]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v23.b }[4], [x22]\n" + "st1 { v24.b }[4], [x21]\n" "b 126f\n" "124:" // Height 4: Partial direct writeback: partial_2_0 "tbz x9, #1, 125f\n" "str h31, [x27], #0x2\n" - "str h16, [x22], #0x2\n" - "str h23, [x21], #0x2\n" - "str h24, [x20], #0x2\n" + "str h16, [x23], #0x2\n" + "str h23, [x22], #0x2\n" + "str h24, [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v23.b }[2], [x21]\n" - "st1 { v24.b }[2], [x20]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v23.b }[2], [x22]\n" + "st1 { v24.b }[2], [x21]\n" "b 126f\n" "125:" // Height 4: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b23, [x21, #0x0]\n" - "str b24, [x20, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b23, [x22, #0x0]\n" + "str b24, [x21, #0x0]\n" "126:" // Height 4: Partial direct writeback: Done "b 128f\n" "127:" // Height 4: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" - "str q23, [x21, #0x0]\n" - "str q24, [x20, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q23, [x22, #0x0]\n" + "str q24, [x21, #0x0]\n" "128:" // Height 4: Writeback done "subs x9, x9, #0x10\n" "bgt 98b\n" @@ -2089,7 +2088,6 @@ void 
a64_hybrid_s8qa_mmla_4x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "130:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp index b028a8a9a3..2b7531d1e2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -108,5 +108,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp index b97b63cdce..38a57b0741 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp @@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 136f\n" @@ -111,11 +110,11 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" "cbnz x14, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" @@ -132,129 +131,129 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "blt 8f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x15, #0x20]\n" + "ldr d17, [x15, #0x20]\n" "ldr x20, [x15, #0x28]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x15, #0x30]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x38]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x15, #0x40]\n" + "ldr d16, [x15, #0x30]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x38]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr d17, [x15, #0x40]\n" "ldr x20, [x15, #0x48]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x15, #0x50]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x15, #0x60]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr d16, [x15, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x15, #0x60]\n" "ldr x20, [x15, #0x68]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x15, #0x70]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x78]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x15, #0x80]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x15, #0x70]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x78]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x15, #0x80]\n" "ldr x20, [x15, #0x88]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x15, #0x90]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x15, #0xa0]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x15, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x15, #0xa0]\n" "ldr x20, [x15, 
#0xa8]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0xb8]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x15, #0xc0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x15, #0xb0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xb8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x15, #0xc0]\n" "ldr x20, [x15, #0xc8]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x15, #0xd0]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr d6, [x15, #0xe0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x15, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr d17, [x15, #0xe0]\n" "ldr x20, [x15, #0xe8]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0xf8]\n" - "mov v7.d[1], x11\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr d16, [x15, #0xf0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xf8]\n" + "mov v16.d[1], x20\n" "add x12, x12, #0x10\n" "add x15, x15, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "ldr d6, [x15, #0x0]\n" "ldr x20, [x15, #0x8]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" "sub x13, x13, #0x10\n" "ldr d7, [x15, #0x10]\n" "cmp x13, #0x20\n" - "ldr x10, [x12, #0x8]\n" + "ldr x21, [x12, #0x8]\n" "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x18]\n" - "mov v0.d[1], x10\n" - "mov v7.d[1], x11\n" + "ldr x20, [x15, #0x18]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x12, #0x80]\n" "bge 7b\n" "8:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q17, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x15, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x15, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x15, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x15, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x15, #0xf0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x15, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x15, #0x60]\n" + ".inst 
0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x15, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x15, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x15, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x15, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x15, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x15, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x15, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x15, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x15, #0xf0]\n" "add x12, x12, #0x10\n" "sub x13, x13, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "add x15, x15, #0x100\n" "9:" // Height 1: Multiply loop: Main loop skip "cbz x13, 14f\n" "cmp x13, #0x4\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" + "ldr s18, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q16, [x15, #0x0]\n" + ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x15, #0x10]\n" + ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" + "ldr q17, [x15, #0x20]\n" "cmp x13, #0x4\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" "add x15, x15, #0x40\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks @@ -267,28 +266,28 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" "13:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x0]\n" + ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x10]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x20]\n" + ".inst 0x4f80e20a // sdot v10.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" "14:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 4b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q16, [x6, #0x0]\n" + "add v8.4s, v8.4s, v16.4s\n" + "ldr q16, [x6, #0x10]\n" + "add v9.4s, v9.4s, v16.4s\n" + "ldr q16, [x6, #0x20]\n" + "add v10.4s, v10.4s, v16.4s\n" + "ldr q16, [x6, 
#0x30]\n" + "add v11.4s, v11.4s, v16.4s\n" "prfm pstl1keep, [x17, #0x0]\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 15f\n" @@ -304,10 +303,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 16f\n" "15:" // Height 1: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -320,45 +319,45 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v10.4s, v10.4s, v6.4s\n" "sqrdmulh v11.4s, v11.4s, v7.4s\n" "tbz %x[flags], #5, 17f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" + "and v19.16b, v8.16b, v0.16b\n" + "and v18.16b, v9.16b, v1.16b\n" + "and v17.16b, v10.16b, v2.16b\n" + "and v16.16b, v11.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v19.4s\n" + "sqadd v9.4s, v9.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v17.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" "17:" // Height 1: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v16.4s }, [x20]\n" + "add v8.4s, v8.4s, v16.4s\n" + "add v9.4s, v9.4s, v16.4s\n" + "add v10.4s, v10.4s, v16.4s\n" + "add v11.4s, v11.4s, v16.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v16.4s }, [x20]\n" + "smin v8.4s, v8.4s, v16.4s\n" + "smin v9.4s, v9.4s, v16.4s\n" + "smin v10.4s, v10.4s, v16.4s\n" + "smin v11.4s, v11.4s, v16.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v16.8h, v10.8h, v11.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v8.16b, v8.16b, v16.16b\n" "bge 26f\n" "tbz x16, #3, 21f\n" "str d8, [x17], #0x8\n" @@ -433,247 +432,247 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "31:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 32f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" - "ldr x9, [x21, 
#0x8]\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x11, [x20, #0x8]\n" "cbnz x14, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" - "add x9, x9, x20\n" + "add x11, x11, x20\n" "b 33f\n" "32:" // Height 2: setup direct input "mov x12, %x[input_ptr]\n" - "add x9, x12, x20\n" + "add x11, x12, x21\n" "33:" // Height 2: input setup done "cmp x13, #0x10\n" "blt 36f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" - "ldr q1, [x9, #0x0]\n" + "ldr q1, [x11, #0x0]\n" "ldr q6, [x15, #0x0]\n" "ldr q7, [x15, #0x10]\n" "blt 35f\n" "34:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x20, [x15, #0x28]\n" + "ldr x21, [x15, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x15, #0x20]\n" + "ldr d17, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr x11, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x15, #0x30]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr d16, [x15, #0x30]\n" + "mov v17.d[1], x21\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr d17, [x15, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "ldr x20, [x15, #0x48]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x15, #0x50]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr x20, [x15, #0x68]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr x11, [x15, #0x78]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x15, #0x70]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr d16, [x15, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr x21, [x15, #0x68]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x15, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr x20, [x15, #0x78]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x15, #0x70]\n" + "mov v17.d[1], x21\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x15, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" "ldr x20, [x15, #0x88]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x15, #0x90]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr x20, [x15, #0xa8]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr x11, [x15, #0xb8]\n" - ".inst 
0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x15, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x15, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr x20, [x15, #0xb8]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x15, #0xb0]\n" + "mov v17.d[1], x21\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x15, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" "ldr x20, [x15, #0xc8]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x15, #0xd0]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr x20, [x15, #0xe8]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr d6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr x11, [x15, #0xf8]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v6.d[1], x20\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x15, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x15, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr x21, [x15, #0xe8]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr d17, [x15, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr d16, [x15, #0xf0]\n" + "mov v17.d[1], x21\n" "add x12, x12, #0x10\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" + "mov v16.d[1], x20\n" + "add x11, x11, #0x10\n" "add x15, x15, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" "ldr d6, [x15, #0x0]\n" - "ldr x20, [x15, #0x8]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr x21, [x15, #0x8]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" + "ldr d1, [x11, #0x0]\n" "sub x13, x13, #0x10\n" "ldr d7, [x15, #0x10]\n" "cmp x13, #0x20\n" - "ldr x10, [x12, #0x8]\n" - "mov v6.d[1], x20\n" - "ldr x28, [x9, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x15, #0x18]\n" - "mov v1.d[1], x28\n" + "ldr x20, [x12, #0x8]\n" + "mov v6.d[1], x21\n" + "ldr x21, [x11, #0x8]\n" + "mov v0.d[1], x20\n" + "ldr x20, [x15, #0x18]\n" + "mov v1.d[1], x21\n" "prfm pldl1keep, [x12, #0x80]\n" - "mov v7.d[1], x11\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v7.d[1], x20\n" + "prfm pldl1keep, [x11, #0x80]\n" "bge 34b\n" "35:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc 
// sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q17, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" "sub x13, x13, #0x10\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x15, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x15, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x15, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x15, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x15, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x15, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x15, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x15, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x15, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x15, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x15, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x15, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x15, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x15, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x15, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x15, 
#0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x15, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x15, #0xf0]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "add x15, x15, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "36:" // Height 2: Multiply loop: Main loop skip "cbz x13, 41f\n" "cmp x13, #0x4\n" "blt 38f\n" "37:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" + "ldr s19, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s18, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x15, #0x0]\n" + ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" + "ldr q16, [x15, #0x10]\n" + ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x15, #0x20]\n" + ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" "bge 37b\n" "38:" // Height 2: Multiply loop: Skip odd blocks "cbz x13, 41f\n" "tbz x13, #1, 39f\n" "ldr h0, [x12], #0x2\n" - "ldr h1, [x9], #0x2\n" + "ldr h1, [x11], #0x2\n" "tbz x13, #0, 40f\n" "ld1 { v0.b }[2], [x12]\n" - "ld1 { v1.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x11]\n" "b 40f\n" "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" - "ldr b1, [x9, #0x0]\n" + "ldr b1, [x11, #0x0]\n" "40:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x15, #0x0]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + "ldr q16, [x15, #0x10]\n" + ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x15, #0x20]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x15, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, 
v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" "41:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 31b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q19, [x6, #0x0]\n" + "add v8.4s, v8.4s, v19.4s\n" + "ldr q18, [x6, #0x10]\n" + "add v9.4s, v9.4s, v18.4s\n" + "ldr q17, [x6, #0x20]\n" + "add v10.4s, v10.4s, v17.4s\n" + "ldr q16, [x6, #0x30]\n" + "add v11.4s, v11.4s, v16.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x17, x20\n" + "add x25, x17, x20\n" "prfm pstl1keep, [x17, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" + "add v12.4s, v12.4s, v19.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v13.4s, v13.4s, v18.4s\n" + "add v14.4s, v14.4s, v17.4s\n" + "add v15.4s, v15.4s, v16.4s\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 42f\n" "ldr q0, [x8, #0x0]\n" @@ -688,10 +687,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 43f\n" "42:" // Height 2: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -708,30 +707,30 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v14.4s, v14.4s, v6.4s\n" "sqrdmulh v15.4s, v15.4s, v7.4s\n" "tbz %x[flags], #5, 44f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" + "and v19.16b, v8.16b, v0.16b\n" + "and v18.16b, v9.16b, v1.16b\n" + "and v17.16b, v10.16b, v2.16b\n" + "and v16.16b, v11.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v19.4s\n" + "sqadd v9.4s, v9.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v17.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "and v19.16b, v12.16b, v0.16b\n" + "and v18.16b, v13.16b, v1.16b\n" + "and v17.16b, v14.16b, v2.16b\n" + "and v16.16b, v15.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v19.4s\n" + "sqadd v13.4s, 
v13.4s, v18.4s\n" + "sqadd v14.4s, v14.4s, v17.4s\n" + "sqadd v15.4s, v15.4s, v16.4s\n" "44:" // Height 2: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" @@ -741,108 +740,108 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v13.4s, v13.4s, v1.4s\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v16.4s }, [x20]\n" + "add v8.4s, v8.4s, v16.4s\n" + "add v9.4s, v9.4s, v16.4s\n" + "add v10.4s, v10.4s, v16.4s\n" + "add v11.4s, v11.4s, v16.4s\n" + "add v12.4s, v12.4s, v16.4s\n" + "add v13.4s, v13.4s, v16.4s\n" + "add v14.4s, v14.4s, v16.4s\n" + "add v15.4s, v15.4s, v16.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v16.4s }, [x20]\n" + "smin v8.4s, v8.4s, v16.4s\n" + "smin v9.4s, v9.4s, v16.4s\n" + "smin v10.4s, v10.4s, v16.4s\n" + "smin v11.4s, v11.4s, v16.4s\n" + "smin v12.4s, v12.4s, v16.4s\n" + "smin v13.4s, v13.4s, v16.4s\n" + "smin v14.4s, v14.4s, v16.4s\n" + "smin v15.4s, v15.4s, v16.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smax v12.4s, v12.4s, v16.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v14.4s, v14.4s, v16.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v17.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v14.8h, v15.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v8.16b, v8.16b, v17.16b\n" + "uzp1 v12.16b, v12.16b, v16.16b\n" "bge 53f\n" "tbz x16, #3, 48f\n" "str d8, [x17], #0x8\n" - "str d12, [x24], #0x8\n" + "str d12, [x25], #0x8\n" "tbz x16, #2, 46f\n" "st1 { v8.s }[2], [x17], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" "tbz x16, #1, 45f\n" "st1 { v8.h }[6], [x17], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" "tbz x16, #0, 52f\n" "st1 { v8.b }[14], [x17]\n" - "st1 { v12.b }[14], [x24]\n" + "st1 { v12.b }[14], [x25]\n" "b 52f\n" "45:" // Height 2: Partial direct writeback: partial_1_12 "tbz x16, #0, 52f\n" "st1 { v8.b }[12], [x17]\n" - "st1 { v12.b }[12], [x24]\n" + "st1 { v12.b }[12], [x25]\n" "b 52f\n" "46:" // Height 2: Partial direct writeback: partial_2_8 "tbz x16, #1, 47f\n" "st1 { v8.h }[4], [x17], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" "tbz x16, #0, 52f\n" "st1 { v8.b }[10], [x17]\n" - "st1 { v12.b 
}[10], [x24]\n" + "st1 { v12.b }[10], [x25]\n" "b 52f\n" "47:" // Height 2: Partial direct writeback: partial_1_8 "tbz x16, #0, 52f\n" "st1 { v8.b }[8], [x17]\n" - "st1 { v12.b }[8], [x24]\n" + "st1 { v12.b }[8], [x25]\n" "b 52f\n" "48:" // Height 2: Partial direct writeback: partial_4_0 "tbz x16, #2, 50f\n" "str s8, [x17], #0x4\n" - "str s12, [x24], #0x4\n" + "str s12, [x25], #0x4\n" "tbz x16, #1, 49f\n" "st1 { v8.h }[2], [x17], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" "tbz x16, #0, 52f\n" "st1 { v8.b }[6], [x17]\n" - "st1 { v12.b }[6], [x24]\n" + "st1 { v12.b }[6], [x25]\n" "b 52f\n" "49:" // Height 2: Partial direct writeback: partial_1_4 "tbz x16, #0, 52f\n" "st1 { v8.b }[4], [x17]\n" - "st1 { v12.b }[4], [x24]\n" + "st1 { v12.b }[4], [x25]\n" "b 52f\n" "50:" // Height 2: Partial direct writeback: partial_2_0 "tbz x16, #1, 51f\n" "str h8, [x17], #0x2\n" - "str h12, [x24], #0x2\n" + "str h12, [x25], #0x2\n" "tbz x16, #0, 52f\n" "st1 { v8.b }[2], [x17]\n" - "st1 { v12.b }[2], [x24]\n" + "st1 { v12.b }[2], [x25]\n" "b 52f\n" "51:" // Height 2: Partial direct writeback: partial_1_0 "str b8, [x17, #0x0]\n" - "str b12, [x24, #0x0]\n" + "str b12, [x25, #0x0]\n" "52:" // Height 2: Partial direct writeback: Done "b 54f\n" "53:" // Height 2: Full writeback "str q8, [x17, #0x0]\n" "add x17, x17, #0x10\n" - "str q12, [x24, #0x0]\n" + "str q12, [x25, #0x0]\n" "54:" // Height 2: Writeback done "subs x16, x16, #0x10\n" "bgt 29b\n" @@ -872,308 +871,308 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "58:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x11, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" "cbnz x14, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "b 60f\n" "59:" // Height 3: setup direct input "mov x12, %x[input_ptr]\n" - "add x9, x12, x20\n" - "add x27, x9, x20\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" "60:" // Height 3: input setup done "cmp x13, #0x10\n" "blt 63f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" + "ldr q1, [x11, #0x0]\n" + "ldr q2, [x10, #0x0]\n" "ldr q6, [x15, #0x0]\n" "ldr q7, [x15, #0x10]\n" "blt 62f\n" "61:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x20, [x15, #0x28]\n" + "ldr x21, [x15, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x15, #0x20]\n" + "ldr d21, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" + "mov v21.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x48]\n" + "ldr x21, [x15, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x15, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, 
v1.4b[0]\n" - "ldr x11, [x15, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x15, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr d20, [x15, #0x30]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + "ldr x20, [x15, #0x58]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr d21, [x15, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + "ldr x21, [x15, #0x68]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr d20, [x15, #0x50]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x78]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x15, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x15, #0x88]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x15, #0x70]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x98]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x15, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x15, #0x90]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xb8]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x15, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x15, #0xc8]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x15, #0xb0]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xd8]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x15, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x15, #0xe8]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x15, #0xd0]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr d21, [x15, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1ea8d 
// sdot v13.4s, v20.16b, v1.4b[3]\n" + "add x12, x12, #0x10\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr d20, [x15, #0xf0]\n" + "mov v20.d[1], x20\n" + "add x11, x11, #0x10\n" + "add x10, x10, #0x10\n" + "add x15, x15, #0x100\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + "ldr x20, [x15, #0x8]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + "ldr x23, [x12, #0x8]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + "ldr d6, [x15, #0x0]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + "ldr d0, [x12, #0x0]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + "ldr d1, [x11, #0x0]\n" + "ldr x22, [x11, #0x8]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" + "ldr d2, [x10, #0x0]\n" + "sub x13, x13, #0x10\n" + "ldr d7, [x15, #0x10]\n" + "cmp x13, #0x20\n" + "ldr x21, [x10, #0x8]\n" "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x15, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x15, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, #0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr d6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "add x12, x12, #0x10\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" - "add x15, x15, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x20, [x15, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x10, [x12, #0x8]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - "ldr d6, [x15, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - "ldr x28, 
[x9, #0x8]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - "sub x13, x13, #0x10\n" - "ldr d7, [x15, #0x10]\n" - "cmp x13, #0x20\n" - "ldr x26, [x27, #0x8]\n" - "mov v6.d[1], x20\n" - "ldr x11, [x15, #0x18]\n" - "mov v0.d[1], x10\n" + "ldr x20, [x15, #0x18]\n" + "mov v0.d[1], x23\n" "prfm pldl1keep, [x12, #0x80]\n" - "mov v1.d[1], x28\n" - "prfm pldl1keep, [x9, #0x80]\n" - "mov v2.d[1], x26\n" - "prfm pldl1keep, [x27, #0x80]\n" - "mov v7.d[1], x11\n" + "mov v1.d[1], x22\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v2.d[1], x21\n" + "prfm pldl1keep, [x10, #0x80]\n" + "mov v7.d[1], x20\n" "bge 61b\n" "62:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q21, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x27, x27, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x13, x13, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q20, [x15, #0x30]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x15, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x15, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x15, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x15, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x15, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" 
- "ldr q6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x15, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x15, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x15, #0x50]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x15, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x15, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x15, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x15, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x15, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x15, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x15, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x15, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x15, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x15, #0xf0]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" "add x15, x15, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "63:" // Height 3: Multiply loop: Main loop skip "cbz x13, 68f\n" "cmp x13, #0x4\n" "blt 65f\n" "64:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" + "ldr s24, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s1, 
[x9], #0x4\n" + "ldr s23, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s22, [x10], #0x4\n" + "ldr q21, [x15, #0x0]\n" + ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" + "ldr q20, [x15, #0x10]\n" + ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x15, #0x20]\n" + ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x15, #0x30]\n" + ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" "bge 64b\n" "65:" // Height 3: Multiply loop: Skip odd blocks "cbz x13, 68f\n" "tbz x13, #1, 66f\n" "ldr h0, [x12], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" + "ldr h1, [x11], #0x2\n" + "ldr h2, [x10], #0x2\n" "tbz x13, #0, 67f\n" "ld1 { v0.b }[2], [x12]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" + "ld1 { v1.b }[2], [x11]\n" + "ld1 { v2.b }[2], [x10]\n" "b 67f\n" "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" + "ldr b1, [x11, #0x0]\n" + "ldr b2, [x10, #0x0]\n" "67:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q21, [x15, #0x0]\n" + ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" + "ldr q20, [x15, #0x10]\n" + ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x15, #0x20]\n" + ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x15, #0x30]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" "68:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 58b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q23, [x6, #0x0]\n" + "add v8.4s, v8.4s, v23.4s\n" + "ldr q22, [x6, #0x10]\n" + "add v9.4s, v9.4s, v22.4s\n" + "ldr q21, [x6, #0x20]\n" + "add v10.4s, v10.4s, v21.4s\n" + "ldr q20, [x6, #0x30]\n" + "add v11.4s, v11.4s, v20.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x17, x20\n" - "add x23, x24, x20\n" + "add x25, x17, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v12.4s, v12.4s, v23.4s\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v13.4s, v13.4s, v22.4s\n" + "add v14.4s, v14.4s, v21.4s\n" + "add v15.4s, v15.4s, v20.4s\n" + "add v16.4s, v16.4s, v23.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, v20.4s\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 69f\n" "ldr q0, [x8, #0x0]\n" @@ -1188,10 +1187,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 70f\n" "69:" // Height 3: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1212,42 +1211,42 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v18.4s, v18.4s, v6.4s\n" "sqrdmulh v19.4s, v19.4s, v7.4s\n" "tbz %x[flags], #5, 71f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr 
v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v8.16b, v0.16b\n" + "and v22.16b, v9.16b, v1.16b\n" + "and v21.16b, v10.16b, v2.16b\n" + "and v20.16b, v11.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v23.4s\n" + "sqadd v9.4s, v9.4s, v22.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sqadd v11.4s, v11.4s, v20.4s\n" + "and v23.16b, v12.16b, v0.16b\n" + "and v22.16b, v13.16b, v1.16b\n" + "and v21.16b, v14.16b, v2.16b\n" + "and v20.16b, v15.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v23.4s\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqadd v14.4s, v14.4s, v21.4s\n" + "sqadd v15.4s, v15.4s, v20.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v1.16b\n" + "and v21.16b, v18.16b, v2.16b\n" + "and v20.16b, v19.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "71:" // Height 3: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" @@ -1261,139 +1260,139 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v20.4s }, [x20]\n" + "add v8.4s, v8.4s, v20.4s\n" + "add v9.4s, v9.4s, v20.4s\n" + "add v10.4s, v10.4s, v20.4s\n" + "add v11.4s, v11.4s, v20.4s\n" + "add v12.4s, v12.4s, v20.4s\n" + "add v13.4s, v13.4s, v20.4s\n" + "add v14.4s, v14.4s, v20.4s\n" + "add v15.4s, v15.4s, v20.4s\n" + "add v16.4s, v16.4s, v20.4s\n" + "add v17.4s, v17.4s, v20.4s\n" + "add v18.4s, v18.4s, v20.4s\n" + "add v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v20.4s }, [x20]\n" + "smin 
v8.4s, v8.4s, v20.4s\n" + "smin v9.4s, v9.4s, v20.4s\n" + "smin v10.4s, v10.4s, v20.4s\n" + "smin v11.4s, v11.4s, v20.4s\n" + "smin v12.4s, v12.4s, v20.4s\n" + "smin v13.4s, v13.4s, v20.4s\n" + "smin v14.4s, v14.4s, v20.4s\n" + "smin v15.4s, v15.4s, v20.4s\n" + "smin v16.4s, v16.4s, v20.4s\n" + "smin v17.4s, v17.4s, v20.4s\n" + "smin v18.4s, v18.4s, v20.4s\n" + "smin v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" + "smax v8.4s, v8.4s, v20.4s\n" + "smax v9.4s, v9.4s, v20.4s\n" + "smax v10.4s, v10.4s, v20.4s\n" + "smax v11.4s, v11.4s, v20.4s\n" + "smax v12.4s, v12.4s, v20.4s\n" + "smax v13.4s, v13.4s, v20.4s\n" + "smax v14.4s, v14.4s, v20.4s\n" + "smax v15.4s, v15.4s, v20.4s\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v21.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v20.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v8.16b, v8.16b, v21.16b\n" + "uzp1 v12.16b, v12.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 80f\n" "tbz x16, #3, 75f\n" "str d8, [x17], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" "tbz x16, #2, 73f\n" "st1 { v8.s }[2], [x17], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" "tbz x16, #1, 72f\n" "st1 { v8.h }[6], [x17], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" "tbz x16, #0, 79f\n" "st1 { v8.b }[14], [x17]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" "b 79f\n" "72:" // Height 3: Partial direct writeback: partial_1_12 "tbz x16, #0, 79f\n" "st1 { v8.b }[12], [x17]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" "b 79f\n" "73:" // Height 3: Partial direct writeback: partial_2_8 "tbz x16, #1, 74f\n" "st1 { v8.h }[4], [x17], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" "tbz x16, #0, 79f\n" "st1 { v8.b }[10], [x17]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" "b 79f\n" "74:" // Height 3: Partial direct writeback: partial_1_8 "tbz x16, #0, 79f\n" "st1 { v8.b }[8], [x17]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" "b 79f\n" "75:" // Height 3: Partial direct writeback: partial_4_0 "tbz x16, #2, 77f\n" "str s8, [x17], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" "tbz x16, #1, 76f\n" "st1 { v8.h }[2], [x17], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" "tbz x16, #0, 79f\n" "st1 { v8.b }[6], [x17]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" 
"b 79f\n" "76:" // Height 3: Partial direct writeback: partial_1_4 "tbz x16, #0, 79f\n" "st1 { v8.b }[4], [x17]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" "b 79f\n" "77:" // Height 3: Partial direct writeback: partial_2_0 "tbz x16, #1, 78f\n" "str h8, [x17], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" "tbz x16, #0, 79f\n" "st1 { v8.b }[2], [x17]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" "b 79f\n" "78:" // Height 3: Partial direct writeback: partial_1_0 "str b8, [x17, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" "79:" // Height 3: Partial direct writeback: Done "b 81f\n" "80:" // Height 3: Full writeback "str q8, [x17, #0x0]\n" "add x17, x17, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" "81:" // Height 3: Writeback done "subs x16, x16, #0x10\n" "bgt 56b\n" @@ -1427,369 +1426,369 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "85:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 86f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x11, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x9, [x20, #0x18]\n" "cbnz x14, 87f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" "b 87f\n" "86:" // Height 4: setup direct input "mov x12, %x[input_ptr]\n" - "add x9, x12, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" "87:" // Height 4: input setup done "cmp x13, #0x10\n" "blt 90f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" + "ldr q1, [x11, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x9, #0x0]\n" "ldr q6, [x15, #0x0]\n" "ldr q7, [x15, #0x10]\n" "blt 89f\n" "88:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x20, [x15, #0x28]\n" + "ldr x21, [x15, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x15, #0x20]\n" + "ldr d25, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" + "mov v25.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x48]\n" + "ldr x21, [x15, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x15, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot 
v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x15, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x10, [x12, #0x8]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr d6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr d6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x15, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d24, [x15, #0x30]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + "ldr x20, [x15, #0x58]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr d25, [x15, #0x40]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + "ldr x21, [x15, #0x68]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + "add x9, x9, #0x10\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr d24, [x15, #0x50]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x78]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + "ldr x25, [x12, #0x8]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x15, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x15, #0x88]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x15, #0x70]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot 
v14.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x98]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x15, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + "ldr x22, [x9, #0x8]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x15, #0x90]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xb8]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" "sub x13, x13, #0x10\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x15, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x15, #0xc8]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" "cmp x13, #0x20\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x15, #0xb0]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xd8]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x15, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x15, #0xe8]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x15, #0xd0]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr d25, [x15, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr d7, [x15, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, 
#0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr d6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x11\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr d24, [x15, #0xf0]\n" + "mov v24.d[1], x20\n" "add x15, x15, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x20, [x15, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, #0x18]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + "ldr x21, [x15, #0x8]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x15, #0x18]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" "ldr d6, [x15, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + "ldr d1, [x11, #0x0]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + "ldr d2, [x10, #0x0]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" + "ldr d3, [x9, #0x0]\n" "ldr d7, [x15, #0x10]\n" - "mov v6.d[1], x20\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v7.d[1], x11\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x24\n" + "mov v2.d[1], x23\n" + "mov v3.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 88b\n" "89:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr q25, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x25, x25, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x13, x13, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q24, [x15, #0x30]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm 
pldl1keep, [x25, #0x80]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x15, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x15, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x15, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x15, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x15, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x15, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x15, #0x40]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x15, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x15, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, 
v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x15, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x15, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x15, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x15, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x15, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x15, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x15, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x15, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x15, #0xf0]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" "add x15, x15, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "90:" // Height 4: Multiply loop: Main loop skip "cbz x13, 95f\n" "cmp x13, #0x4\n" "blt 92f\n" "91:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" + "ldr s29, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s28, [x11], #0x4\n" "cmp x13, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, 
v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s27, [x10], #0x4\n" + "ldr s26, [x9], #0x4\n" + "ldr q25, [x15, #0x0]\n" + ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" + "ldr q24, [x15, #0x10]\n" + ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x15, #0x20]\n" + ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x15, #0x30]\n" + ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" "bge 91b\n" "92:" // Height 4: Multiply loop: Skip odd blocks "cbz x13, 95f\n" "tbz x13, #1, 93f\n" "ldr h0, [x12], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" + "ldr h1, [x11], #0x2\n" + "ldr h2, [x10], #0x2\n" + "ldr h3, [x9], #0x2\n" "tbz x13, #0, 94f\n" "ld1 { v0.b }[2], [x12]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x11]\n" + "ld1 { v2.b }[2], [x10]\n" + "ld1 { v3.b }[2], [x9]\n" "b 94f\n" "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" + "ldr b1, [x11, #0x0]\n" + "ldr b2, [x10, #0x0]\n" + "ldr b3, [x9, #0x0]\n" "94:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q25, [x15, #0x0]\n" + ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" + 
"ldr q24, [x15, #0x10]\n" + ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x15, #0x20]\n" + ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x15, #0x30]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" "95:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 85b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q27, [x6, #0x0]\n" + "add v8.4s, v8.4s, v27.4s\n" + "ldr q26, [x6, #0x10]\n" + "add v9.4s, v9.4s, v26.4s\n" + "ldr q25, [x6, #0x20]\n" + "add v10.4s, v10.4s, v25.4s\n" + "ldr q24, [x6, #0x30]\n" + "add v11.4s, v11.4s, v24.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x17, x20\n" + "add x25, x17, x20\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" - "add x22, x23, x20\n" "prfm pstl1keep, [x17, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" + "add v12.4s, v12.4s, v27.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v13.4s, v13.4s, v26.4s\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v25.4s\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v14.4s, v14.4s, v2.4s\n" - "prfm pstl1keep, [x22, #0x0]\n" - "add v15.4s, v15.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add v15.4s, v15.4s, v24.4s\n" + "add v16.4s, v16.4s, v27.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v27.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v23.4s, v23.4s, v24.4s\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 96f\n" "ldr q0, [x8, #0x0]\n" @@ -1804,10 +1803,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 97f\n" "96:" // Height 4: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + 
"ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1832,54 +1831,54 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v22.4s, v22.4s, v6.4s\n" "sqrdmulh v23.4s, v23.4s, v7.4s\n" "tbz %x[flags], #5, 98f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" + "and v27.16b, v8.16b, v0.16b\n" + "and v26.16b, v9.16b, v1.16b\n" + "and v25.16b, v10.16b, v2.16b\n" + "and v24.16b, v11.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "sqadd v9.4s, v9.4s, v26.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v24.4s\n" + "and v27.16b, v12.16b, v0.16b\n" + "and v26.16b, v13.16b, v1.16b\n" + "and v25.16b, v14.16b, v2.16b\n" + "and v24.16b, v15.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v27.4s\n" + "sqadd v13.4s, v13.4s, v26.4s\n" + "sqadd v14.4s, v14.4s, v25.4s\n" + "sqadd v15.4s, v15.4s, v24.4s\n" + "and v27.16b, v16.16b, v0.16b\n" + "and v26.16b, v17.16b, v1.16b\n" + "and v25.16b, v18.16b, v2.16b\n" + "and v24.16b, v19.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v27.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "sqadd v18.4s, v18.4s, v25.4s\n" + "sqadd v19.4s, v19.4s, v24.4s\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v1.16b\n" + "and v25.16b, v22.16b, v2.16b\n" + "and v24.16b, v23.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "98:" // Height 4: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, 
v9.4s, v1.4s\n" @@ -1897,170 +1896,170 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v21.4s, v21.4s, v1.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v24.4s }, [x20]\n" + "add v8.4s, v8.4s, v24.4s\n" + "add v9.4s, v9.4s, v24.4s\n" + "add v10.4s, v10.4s, v24.4s\n" + "add v11.4s, v11.4s, v24.4s\n" + "add v12.4s, v12.4s, v24.4s\n" + "add v13.4s, v13.4s, v24.4s\n" + "add v14.4s, v14.4s, v24.4s\n" + "add v15.4s, v15.4s, v24.4s\n" + "add v16.4s, v16.4s, v24.4s\n" + "add v17.4s, v17.4s, v24.4s\n" + "add v18.4s, v18.4s, v24.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v24.4s\n" + "add v21.4s, v21.4s, v24.4s\n" + "add v22.4s, v22.4s, v24.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v24.4s }, [x20]\n" + "smin v8.4s, v8.4s, v24.4s\n" + "smin v9.4s, v9.4s, v24.4s\n" + "smin v10.4s, v10.4s, v24.4s\n" + "smin v11.4s, v11.4s, v24.4s\n" + "smin v12.4s, v12.4s, v24.4s\n" + "smin v13.4s, v13.4s, v24.4s\n" + "smin v14.4s, v14.4s, v24.4s\n" + "smin v15.4s, v15.4s, v24.4s\n" + "smin v16.4s, v16.4s, v24.4s\n" + "smin v17.4s, v17.4s, v24.4s\n" + "smin v18.4s, v18.4s, v24.4s\n" + "smin v19.4s, v19.4s, v24.4s\n" + "smin v20.4s, v20.4s, v24.4s\n" + "smin v21.4s, v21.4s, v24.4s\n" + "smin v22.4s, v22.4s, v24.4s\n" + "smin v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" + "smax v8.4s, v8.4s, v24.4s\n" + "smax v9.4s, v9.4s, v24.4s\n" + "smax v10.4s, v10.4s, v24.4s\n" + "smax v11.4s, v11.4s, v24.4s\n" + "smax v12.4s, v12.4s, v24.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smax v14.4s, v14.4s, v24.4s\n" + "smax v15.4s, v15.4s, v24.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, 
v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v25.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v24.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v8.16b, v8.16b, v25.16b\n" + "uzp1 v12.16b, v12.16b, v24.16b\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 107f\n" "tbz x16, #3, 102f\n" "str d8, [x17], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" "tbz x16, #2, 100f\n" "st1 { v8.s }[2], [x17], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" "tbz x16, #1, 99f\n" "st1 { v8.h }[6], [x17], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" "tbz x16, #0, 106f\n" "st1 { v8.b }[14], [x17]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" "b 106f\n" "99:" // Height 4: Partial direct writeback: partial_1_12 "tbz x16, #0, 106f\n" "st1 { v8.b }[12], [x17]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" "b 106f\n" "100:" // Height 4: Partial direct writeback: partial_2_8 "tbz x16, #1, 101f\n" "st1 { v8.h }[4], [x17], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" "tbz x16, #0, 106f\n" "st1 { v8.b }[10], [x17]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" "b 106f\n" "101:" // Height 4: Partial direct writeback: partial_1_8 "tbz x16, #0, 106f\n" "st1 { v8.b }[8], [x17]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" "b 106f\n" "102:" // Height 4: Partial direct writeback: partial_4_0 "tbz x16, #2, 104f\n" "str s8, [x17], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" "tbz x16, #1, 103f\n" "st1 { v8.h }[2], [x17], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + 
"st1 { v20.h }[2], [x23], #0x2\n" "tbz x16, #0, 106f\n" "st1 { v8.b }[6], [x17]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" "b 106f\n" "103:" // Height 4: Partial direct writeback: partial_1_4 "tbz x16, #0, 106f\n" "st1 { v8.b }[4], [x17]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" "b 106f\n" "104:" // Height 4: Partial direct writeback: partial_2_0 "tbz x16, #1, 105f\n" "str h8, [x17], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" "tbz x16, #0, 106f\n" "st1 { v8.b }[2], [x17]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" "b 106f\n" "105:" // Height 4: Partial direct writeback: partial_1_0 "str b8, [x17, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" "106:" // Height 4: Partial direct writeback: Done "b 108f\n" "107:" // Height 4: Full writeback "str q8, [x17, #0x0]\n" "add x17, x17, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" "108:" // Height 4: Writeback done "subs x16, x16, #0x10\n" "bgt 83b\n" @@ -2089,439 +2088,439 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "movi v21.4s, #0x0\n" "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "111:" // Height 5: setup done - "mov x14, #0x0\n" - "112:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 113f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "cbnz x14, 114f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x12, x12, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "b 114f\n" - "113:" // Height 5: setup direct input - "mov x12, %x[input_ptr]\n" - "add x9, x12, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "114:" // Height 5: input setup done - "cmp x13, #0x10\n" - "blt 117f\n" - "ldr q0, [x12, #0x0]\n" - "cmp x13, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q6, [x15, #0x0]\n" - "ldr q7, [x15, #0x10]\n" - "blt 116f\n" - "115:" // Height 5: Multiply loop: Main loop head - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x20, [x15, #0x28]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x38]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x12, x12, #0x10\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x15, #0x20]\n" - ".inst 
0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x48]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x15, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x10, [x12, #0x8]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x15, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr x22, [x23, #0x8]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "sub x13, x13, #0x10\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "cmp x13, #0x20\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x15, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, 
v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x15, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, #0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr d6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x20\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x11\n" - "add x15, x15, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x20, [x15, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, #0x18]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" - "ldr d6, [x15, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - "ldr d0, [x12, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" - "ldr d7, [x15, #0x10]\n" - "mov v6.d[1], x20\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v4.d[1], x22\n" - "mov v7.d[1], x11\n" - "bge 115b\n" - "116:" // Height 5: Multiply loop: Single iteration only - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "add x12, x12, #0x10\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "sub x13, x13, #0x10\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "prfm pldl1keep, [x12, #0x80]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, 
v1.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x15, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x15, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x15, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x15, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x15, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x15, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x15, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x15, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x15, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x15, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x15, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, 
v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x15, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "add x15, x15, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "117:" // Height 5: Multiply loop: Main loop skip - "cbz x13, 122f\n" - "cmp x13, #0x4\n" - "blt 119f\n" - "118:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" - "sub x13, x13, #0x4\n" - "ldr s1, [x9], #0x4\n" - "cmp x13, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "111:" // Height 5: setup done + "mov x14, #0x0\n" + "112:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w13, [x20, x14, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 113f\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x11, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x9, [x20, #0x18]\n" + "ldr x28, [x20, #0x20]\n" + "cbnz x14, 114f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" + "b 114f\n" + "113:" // Height 5: setup direct input + "mov x12, %x[input_ptr]\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "add x28, x9, x21\n" + "114:" // Height 5: input setup done + "cmp x13, #0x10\n" + "blt 117f\n" + "ldr q0, [x12, #0x0]\n" + "cmp x13, #0x20\n" + "ldr q1, [x11, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x9, #0x0]\n" + "ldr q4, [x28, #0x0]\n" "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "ldr q7, [x15, #0x10]\n" + "blt 116f\n" + "115:" // Height 5: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr x21, [x15, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr x20, [x15, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x12, x12, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "add x11, x11, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x15, #0x20]\n" + "ldr d29, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr x21, [x15, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "add x10, x10, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "add x9, x9, #0x10\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr d28, [x15, #0x30]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + "ldr x20, [x15, #0x58]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e3b6 // 
sdot v22.4s, v29.16b, v3.4b[0]\n" + "ldr x26, [x12, #0x8]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr d29, [x15, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + "ldr x21, [x15, #0x68]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + "ldr x25, [x11, #0x8]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + "ldr x24, [x10, #0x8]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr d28, [x15, #0x50]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x78]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + "ldr x23, [x9, #0x8]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + "ldr x22, [x28, #0x8]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x15, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x15, #0x88]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + "sub x13, x13, #0x10\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + "cmp x13, #0x20\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x15, #0x70]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x15, #0x98]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x15, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x15, #0xa8]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x15, #0x90]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xb8]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x15, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x15, #0xc8]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x15, #0xb0]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x15, #0xd8]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x15, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x15, 
#0xe8]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x15, #0xd0]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x15, #0xf8]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr d29, [x15, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr d28, [x15, #0xf0]\n" + "mov v28.d[1], x20\n" + "add x15, x15, #0x100\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + "ldr x21, [x15, #0x8]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x15, #0x18]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + "ldr d6, [x15, #0x0]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + "ldr d0, [x12, #0x0]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + "ldr d1, [x11, #0x0]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + "ldr d2, [x10, #0x0]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + "ldr d3, [x9, #0x0]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" + "ldr d4, [x28, #0x0]\n" + "ldr d7, [x15, #0x10]\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x26\n" + "mov v1.d[1], x25\n" + "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" + "mov v4.d[1], x22\n" + "mov v7.d[1], x20\n" + "bge 115b\n" + "116:" // Height 5: Multiply loop: Single iteration only + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "add x12, x12, #0x10\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x11, x11, #0x10\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "add x9, x9, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q29, [x15, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "sub x13, x13, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q28, [x15, #0x30]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x15, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, 
[x15, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x15, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x15, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x15, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x15, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x15, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x15, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x15, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x15, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x15, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x15, #0xf0]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, 
v1.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" + "117:" // Height 5: Multiply loop: Main loop skip + "cbz x13, 122f\n" + "cmp x13, #0x4\n" + "blt 119f\n" + "118:" // Height 5: Multiply loop: Odd block loop + "ldr s2, [x12], #0x4\n" + "sub x13, x13, #0x4\n" + "ldr s1, [x11], #0x4\n" + "cmp x13, #0x4\n" + "ldr s0, [x10], #0x4\n" + "ldr s31, [x9], #0x4\n" + "ldr s30, [x28], #0x4\n" + "ldr q29, [x15, #0x0]\n" + ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" + "ldr q28, [x15, #0x10]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x15, #0x20]\n" + ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x15, #0x30]\n" + ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" "bge 118b\n" "119:" // Height 5: Multiply loop: Skip odd blocks "cbz x13, 122f\n" "tbz x13, #1, 120f\n" "ldr h0, [x12], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" + "ldr h1, [x11], #0x2\n" + "ldr h2, [x10], #0x2\n" + "ldr h3, [x9], #0x2\n" + "ldr h4, [x28], #0x2\n" "tbz x13, #0, 121f\n" "ld1 { v0.b }[2], [x12]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x11]\n" + "ld1 { v2.b }[2], [x10]\n" + "ld1 { v3.b }[2], [x9]\n" + "ld1 { v4.b }[2], [x28]\n" "b 121f\n" "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" + "ldr b1, [x11, #0x0]\n" + "ldr b2, [x10, #0x0]\n" + "ldr b3, [x9, #0x0]\n" + "ldr b4, [x28, #0x0]\n" "121:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, 
v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q29, [x15, #0x0]\n" + ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" + "ldr q28, [x15, #0x10]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x15, #0x20]\n" + ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x15, #0x30]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" "122:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 112b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q31, [x6, #0x0]\n" + "add v8.4s, v8.4s, v31.4s\n" + "ldr q30, [x6, #0x10]\n" + "add v9.4s, v9.4s, v30.4s\n" + "ldr q29, [x6, #0x20]\n" + "add v10.4s, v10.4s, v29.4s\n" + "ldr q28, [x6, #0x30]\n" + "add v11.4s, v11.4s, v28.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x17, x20\n" + "add x25, x17, x20\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" "add x22, x23, x20\n" - "add x21, x22, x20\n" "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v12.4s, v12.4s, v31.4s\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v30.4s\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v29.4s\n" "prfm pstl1keep, [x22, #0x0]\n" - "add v14.4s, v14.4s, v2.4s\n" - "prfm pstl1keep, [x21, #0x0]\n" - "add v15.4s, v15.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, 
v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "add v15.4s, v15.4s, v28.4s\n" + "add v16.4s, v16.4s, v31.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v31.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v31.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 123f\n" "ldr q0, [x8, #0x0]\n" @@ -2536,10 +2535,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 124f\n" "123:" // Height 5: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -2568,66 +2567,66 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v27.4s, v27.4s, v7.4s\n" "tbz %x[flags], #5, 125f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" + "and v31.16b, v8.16b, v0.16b\n" + "and v30.16b, v9.16b, v1.16b\n" + "and v29.16b, v10.16b, v2.16b\n" + "and v28.16b, 
v11.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v31.4s\n" + "sqadd v9.4s, v9.4s, v30.4s\n" + "sqadd v10.4s, v10.4s, v29.4s\n" + "sqadd v11.4s, v11.4s, v28.4s\n" + "and v31.16b, v12.16b, v0.16b\n" + "and v30.16b, v13.16b, v1.16b\n" + "and v29.16b, v14.16b, v2.16b\n" + "and v28.16b, v15.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v31.4s\n" + "sqadd v13.4s, v13.4s, v30.4s\n" + "sqadd v14.4s, v14.4s, v29.4s\n" + "sqadd v15.4s, v15.4s, v28.4s\n" + "and v31.16b, v16.16b, v0.16b\n" + "and v30.16b, v17.16b, v1.16b\n" + "and v29.16b, v18.16b, v2.16b\n" + "and v28.16b, v19.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v31.4s\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "and v31.16b, v20.16b, v0.16b\n" + "and v30.16b, v21.16b, v1.16b\n" + "and v29.16b, v22.16b, v2.16b\n" + "and v28.16b, v23.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v31.4s\n" + "sqadd v21.4s, v21.4s, v30.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sqadd v23.4s, v23.4s, v28.4s\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v1.16b\n" + "and v29.16b, v26.16b, v2.16b\n" + "and v28.16b, v27.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "125:" // Height 5: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" @@ -2649,201 +2648,201 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin 
v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v28.4s }, [x20]\n" + "add v8.4s, v8.4s, v28.4s\n" + "add v9.4s, v9.4s, v28.4s\n" + "add v10.4s, v10.4s, v28.4s\n" + "add v11.4s, v11.4s, v28.4s\n" + "add v12.4s, v12.4s, v28.4s\n" + "add v13.4s, v13.4s, v28.4s\n" + "add v14.4s, v14.4s, v28.4s\n" + "add v15.4s, v15.4s, v28.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "add v18.4s, v18.4s, v28.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v28.4s\n" + "add v21.4s, v21.4s, v28.4s\n" + "add v22.4s, v22.4s, v28.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v28.4s\n" + "add v25.4s, v25.4s, v28.4s\n" + "add v26.4s, v26.4s, v28.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" + "smin v8.4s, v8.4s, v28.4s\n" + "smin v9.4s, v9.4s, v28.4s\n" + "smin v10.4s, v10.4s, v28.4s\n" + "smin v11.4s, v11.4s, v28.4s\n" + "smin v12.4s, v12.4s, v28.4s\n" + "smin v13.4s, v13.4s, v28.4s\n" + "smin v14.4s, v14.4s, v28.4s\n" + "smin v15.4s, v15.4s, v28.4s\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v23.4s, v23.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" + "smax v8.4s, v8.4s, v28.4s\n" + "smax v9.4s, v9.4s, v28.4s\n" + "smax v10.4s, v10.4s, v28.4s\n" + "smax v11.4s, v11.4s, v28.4s\n" + "smax v12.4s, v12.4s, v28.4s\n" + "smax v13.4s, v13.4s, v28.4s\n" + "smax v14.4s, v14.4s, v28.4s\n" + "smax v15.4s, v15.4s, v28.4s\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v29.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v28.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, 
v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v8.16b, v8.16b, v29.16b\n" + "uzp1 v12.16b, v12.16b, v28.16b\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 134f\n" "tbz x16, #3, 129f\n" "str d8, [x17], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x16, #2, 127f\n" "st1 { v8.s }[2], [x17], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x16, #1, 126f\n" "st1 { v8.h }[6], [x17], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x16, #0, 133f\n" "st1 { v8.b }[14], [x17]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 133f\n" "126:" // Height 5: Partial direct writeback: partial_1_12 "tbz x16, #0, 133f\n" "st1 { v8.b }[12], [x17]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 133f\n" "127:" // Height 5: Partial direct writeback: partial_2_8 "tbz x16, #1, 128f\n" "st1 { v8.h }[4], [x17], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x16, #0, 133f\n" "st1 { v8.b }[10], [x17]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 133f\n" "128:" // Height 5: Partial direct writeback: partial_1_8 "tbz x16, #0, 133f\n" "st1 { v8.b }[8], [x17]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 133f\n" "129:" // Height 5: Partial direct writeback: partial_4_0 "tbz x16, #2, 131f\n" "str s8, [x17], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x16, #1, 130f\n" "st1 { v8.h }[2], [x17], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v12.h }[2], 
[x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x16, #0, 133f\n" "st1 { v8.b }[6], [x17]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 133f\n" "130:" // Height 5: Partial direct writeback: partial_1_4 "tbz x16, #0, 133f\n" "st1 { v8.b }[4], [x17]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 133f\n" "131:" // Height 5: Partial direct writeback: partial_2_0 "tbz x16, #1, 132f\n" "str h8, [x17], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x16, #0, 133f\n" "st1 { v8.b }[2], [x17]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 133f\n" "132:" // Height 5: Partial direct writeback: partial_1_0 "str b8, [x17, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "133:" // Height 5: Partial direct writeback: Done "b 135f\n" "134:" // Height 5: Full writeback "str q8, [x17, #0x0]\n" "add x17, x17, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "135:" // Height 5: Writeback done "subs x16, x16, #0x10\n" "bgt 110b\n" @@ -2888,191 +2887,191 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "139:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w13, [x20, x14, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 140f\n" - "ldr x21, [%x[input_ptr], x14, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x12, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x12, [x20, #0x0]\n" + "ldr x11, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x9, [x20, #0x18]\n" + "ldr x28, [x20, #0x20]\n" + "ldr x27, [x20, #0x28]\n" "cbnz x14, 141f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "add x9, x9, x20\n" + "add x28, x28, x20\n" "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "add x21, x21, x20\n" "b 141f\n" "140:" // Height 6: setup direct input "mov x12, %x[input_ptr]\n" - "add x9, x12, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "add x21, x23, x20\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "add x28, x9, x21\n" + "add x27, x28, x21\n" "141:" // 
Height 6: input setup done "cmp x13, #0x10\n" "blt 144f\n" "ldr q0, [x12, #0x0]\n" "cmp x13, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x21, #0x0]\n" + "ldr q1, [x11, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x9, #0x0]\n" + "ldr q4, [x28, #0x0]\n" + "ldr q5, [x27, #0x0]\n" "ldr q6, [x15, #0x0]\n" "ldr q7, [x15, #0x10]\n" "blt 143f\n" "142:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x20, [x15, #0x28]\n" + "ldr x21, [x15, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x38]\n" + "ldr x20, [x15, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "add x27, x27, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x48]\n" + "ldr x21, [x15, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "add x21, x21, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x15, #0x30]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x15, #0x58]\n" + "ldr x20, [x15, #0x58]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr x10, [x12, #0x8]\n" + "ldr x26, [x12, #0x8]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x25, [x11, #0x8]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr x26, [x27, #0x8]\n" + "ldr x24, [x10, #0x8]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x15, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x20, [x15, #0x68]\n" + "ldr x21, [x15, #0x68]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x23, [x9, #0x8]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" "sub x13, x13, #0x10\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" "cmp x13, #0x20\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x15, #0x50]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x78]\n" + "ldr x20, [x15, #0x78]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x15, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 
0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0x88]\n" + "ldr x21, [x15, #0x88]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x15, #0x70]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x15, #0x98]\n" + "ldr x20, [x15, #0x98]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x15, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x20, [x15, #0xa8]\n" + "ldr x21, [x15, #0xa8]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x15, #0x90]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xb8]\n" + "ldr x20, [x15, #0xb8]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x15, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xc8]\n" + "ldr x21, [x15, #0xc8]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x15, #0xb0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x15, #0xd8]\n" + "ldr x20, [x15, #0xd8]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x15, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x20, [x15, #0xe8]\n" + "ldr x21, [x15, #0xe8]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x15, #0xd0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x15, #0xf8]\n" + "ldr x20, [x15, #0xf8]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, 
v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" "ldr d6, [x15, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x20\n" + "mov v6.d[1], x21\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x22, [x28, #0x8]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" "ldr d7, [x15, #0xf0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "add x15, x15, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" "ldr x20, [x15, #0x8]\n" @@ -3085,58 +3084,58 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x12, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x11, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x10, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d3, [x9, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" + "ldr d4, [x28, #0x0]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "ldr d5, [x21, #0x0]\n" + "ldr d5, [x27, #0x0]\n" "ldr d7, [x15, #0x10]\n" "mov v6.d[1], x20\n" - "ldr x20, [x21, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x15, #0x18]\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" + "ldr x21, [x27, #0x8]\n" + "mov v0.d[1], x26\n" + "ldr x20, [x15, #0x18]\n" + "mov v1.d[1], x25\n" + "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" "mov v4.d[1], x22\n" - "mov v5.d[1], x20\n" - "mov v7.d[1], x11\n" + "mov v5.d[1], x21\n" + "mov v7.d[1], x20\n" "bge 142b\n" "143:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x12, x12, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "ldr q6, [x15, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x21, x21, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x13, x13, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" "ldr q7, [x15, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da 
// sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" @@ -3236,143 +3235,143 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "cmp x13, #0x4\n" "blt 146f\n" "145:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x12], #0x4\n" + "ldr s7, [x12], #0x4\n" "sub x13, x13, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s6, [x11], #0x4\n" "cmp x13, #0x4\n" + "ldr s5, [x10], #0x4\n" + "ldr s4, [x9], #0x4\n" + "ldr s3, [x28], #0x4\n" "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q1, [x15, #0x0]\n" + ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" + "ldr q0, [x15, #0x10]\n" + ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x15, #0x20]\n" + ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x15, #0x30]\n" + ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" "bge 145b\n" "146:" // Height 6: Multiply loop: Skip odd blocks "cbz x13, 
149f\n" "tbz x13, #1, 147f\n" "ldr h0, [x12], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" - "ldr h5, [x21], #0x2\n" + "ldr h1, [x11], #0x2\n" + "ldr h2, [x10], #0x2\n" + "ldr h3, [x9], #0x2\n" + "ldr h4, [x28], #0x2\n" + "ldr h5, [x27], #0x2\n" "tbz x13, #0, 148f\n" "ld1 { v0.b }[2], [x12]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" - "ld1 { v5.b }[2], [x21]\n" + "ld1 { v1.b }[2], [x11]\n" + "ld1 { v2.b }[2], [x10]\n" + "ld1 { v3.b }[2], [x9]\n" + "ld1 { v4.b }[2], [x28]\n" + "ld1 { v5.b }[2], [x27]\n" "b 148f\n" "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x12, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" - "ldr b5, [x21, #0x0]\n" + "ldr b1, [x11, #0x0]\n" + "ldr b2, [x10, #0x0]\n" + "ldr b3, [x9, #0x0]\n" + "ldr b4, [x28, #0x0]\n" + "ldr b5, [x27, #0x0]\n" "148:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x15, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x15, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x15, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x15, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x15, #0x0]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x15, #0x10]\n" + ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x15, #0x20]\n" + ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x15, #0x30]\n" + ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" "add x15, x15, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 
0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" "149:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x14, x14, #0x1\n" "cmp x14, x20\n" "bne 139b\n" - "ldr q0, [x6, #0x0]\n" - "add v8.4s, v8.4s, v0.4s\n" - "ldr q1, [x6, #0x10]\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x6, #0x20]\n" - "add v10.4s, v10.4s, v2.4s\n" - "ldr q3, [x6, #0x30]\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q3, [x6, #0x0]\n" + "add v8.4s, v8.4s, v3.4s\n" + "ldr q2, [x6, #0x10]\n" + "add v9.4s, v9.4s, v2.4s\n" + "ldr q1, [x6, #0x20]\n" + "add v10.4s, v10.4s, v1.4s\n" + "ldr q0, [x6, #0x30]\n" + "add v11.4s, v11.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x17, x20\n" + "add x25, x17, x20\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" "add x22, x23, x20\n" "add x21, x22, x20\n" - "add x20, x21, x20\n" "prfm pstl1keep, [x17, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" + "add v12.4s, v12.4s, v3.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v13.4s, v13.4s, v2.4s\n" "prfm pstl1keep, [x24, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v1.4s\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v0.4s\n" "prfm pstl1keep, [x22, #0x0]\n" - "add v15.4s, v15.4s, v3.4s\n" + "add v16.4s, v16.4s, v3.4s\n" "prfm pstl1keep, [x21, #0x0]\n" - "add v16.4s, v16.4s, v0.4s\n" - "prfm pstl1keep, [x20, #0x0]\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v0.4s\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" + "add v17.4s, v17.4s, v2.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v1.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v2.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v1.4s\n" + "add v31.4s, v31.4s, v0.4s\n" "add x6, x6, #0x40\n" "tbz %x[flags], #4, 150f\n" "ldr q0, [x8, #0x0]\n" @@ -3387,10 +3386,10 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "add x7, x7, #0x40\n" "b 151f\n" "150:" // Height 6: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -3423,78 +3422,78 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "sqrdmulh v30.4s, v30.4s, v6.4s\n" "sqrdmulh v31.4s, v31.4s, v7.4s\n" "tbz %x[flags], #5, 152f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, 
v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v7.16b, v8.16b, v0.16b\n" + "and v6.16b, v9.16b, v1.16b\n" + "and v5.16b, v10.16b, v2.16b\n" + "and v4.16b, v11.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v7.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "sqadd v10.4s, v10.4s, v5.4s\n" + "sqadd v11.4s, v11.4s, v4.4s\n" + "and v7.16b, v12.16b, v0.16b\n" + "and v6.16b, v13.16b, v1.16b\n" + "and v5.16b, v14.16b, v2.16b\n" + "and v4.16b, v15.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v7.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "sqadd v14.4s, v14.4s, v5.4s\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "and v7.16b, v16.16b, v0.16b\n" + "and v6.16b, v17.16b, v1.16b\n" + "and v5.16b, v18.16b, v2.16b\n" + "and v4.16b, v19.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v5.16b, v29.16b, v1.16b\n" - "and v6.16b, v30.16b, v2.16b\n" - "and v7.16b, v31.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v7.4s\n" + "sqadd v17.4s, v17.4s, v6.4s\n" + "sqadd v18.4s, v18.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "and v7.16b, v20.16b, v0.16b\n" + "and v6.16b, v21.16b, v1.16b\n" + "and v5.16b, v22.16b, v2.16b\n" + "and v4.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v7.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "sqadd v22.4s, v22.4s, v5.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v7.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v1.16b\n" + "and v5.16b, v26.16b, v2.16b\n" + "and v4.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, 
#0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v7.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "sqadd v27.4s, v27.4s, v4.4s\n" + "and v7.16b, v28.16b, v0.16b\n" + "and v6.16b, v29.16b, v1.16b\n" + "and v5.16b, v30.16b, v2.16b\n" + "and v4.16b, v31.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v5.4s\n" - "sqadd v30.4s, v30.4s, v6.4s\n" - "sqadd v31.4s, v31.4s, v7.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v7.4s\n" + "sqadd v29.4s, v29.4s, v6.4s\n" + "sqadd v30.4s, v30.4s, v5.4s\n" + "sqadd v31.4s, v31.4s, v4.4s\n" "152:" // Height 6: no shift correction "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" @@ -3520,232 +3519,232 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "srshl v29.4s, v29.4s, v1.4s\n" "srshl v30.4s, v30.4s, v2.4s\n" "srshl v31.4s, v31.4s, v3.4s\n" - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + 
"add x20, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x20]\n" + "add v8.4s, v8.4s, v0.4s\n" + "add v9.4s, v9.4s, v0.4s\n" + "add v10.4s, v10.4s, v0.4s\n" + "add v11.4s, v11.4s, v0.4s\n" + "add v12.4s, v12.4s, v0.4s\n" + "add v13.4s, v13.4s, v0.4s\n" + "add v14.4s, v14.4s, v0.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v0.4s\n" + "add v18.4s, v18.4s, v0.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v0.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v0.4s\n" + "add v30.4s, v30.4s, v0.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v0.4s }, [x20]\n" + "smin v8.4s, v8.4s, v0.4s\n" + "smin v9.4s, v9.4s, v0.4s\n" + "smin v10.4s, v10.4s, v0.4s\n" + "smin v11.4s, v11.4s, v0.4s\n" + "smin v12.4s, v12.4s, v0.4s\n" + "smin v13.4s, v13.4s, v0.4s\n" + "smin v14.4s, v14.4s, v0.4s\n" + "smin v15.4s, v15.4s, v0.4s\n" + "smin v16.4s, v16.4s, v0.4s\n" + "smin v17.4s, v17.4s, v0.4s\n" + "smin v18.4s, v18.4s, v0.4s\n" + "smin v19.4s, v19.4s, v0.4s\n" + "smin v20.4s, v20.4s, v0.4s\n" + "smin v21.4s, v21.4s, v0.4s\n" + "smin v22.4s, v22.4s, v0.4s\n" + "smin v23.4s, v23.4s, v0.4s\n" + "smin v24.4s, v24.4s, v0.4s\n" + "smin v25.4s, v25.4s, v0.4s\n" + "smin v26.4s, v26.4s, v0.4s\n" + "smin v27.4s, v27.4s, v0.4s\n" + "smin v28.4s, v28.4s, v0.4s\n" + "smin v29.4s, v29.4s, v0.4s\n" + "smin v30.4s, v30.4s, v0.4s\n" + "smin v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v0.4s }, [x20]\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smax v13.4s, v13.4s, v0.4s\n" + "smax v14.4s, v14.4s, v0.4s\n" + "smax v15.4s, v15.4s, v0.4s\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smax v21.4s, v21.4s, v0.4s\n" + "smax v22.4s, v22.4s, v0.4s\n" + "smax v23.4s, v23.4s, v0.4s\n" + "smax v24.4s, v24.4s, v0.4s\n" + "smax v25.4s, v25.4s, v0.4s\n" + "smax v26.4s, v26.4s, v0.4s\n" + "smax v27.4s, v27.4s, v0.4s\n" + "smax v28.4s, v28.4s, v0.4s\n" + "smax v29.4s, v29.4s, v0.4s\n" + "smax v30.4s, v30.4s, v0.4s\n" + "smax v31.4s, v31.4s, v0.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v2.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v1.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" "cmp x16, #0x10\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v8.16b, v8.16b, v2.16b\n" + "uzp1 v12.16b, v12.16b, v1.16b\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, 
v28.16b, v17.16b\n" "bge 161f\n" "tbz x16, #3, 156f\n" "str d8, [x17], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" "tbz x16, #2, 154f\n" "st1 { v8.s }[2], [x17], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" "tbz x16, #1, 153f\n" "st1 { v8.h }[6], [x17], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" "tbz x16, #0, 160f\n" "st1 { v8.b }[14], [x17]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 160f\n" "153:" // Height 6: Partial direct writeback: partial_1_12 "tbz x16, #0, 160f\n" "st1 { v8.b }[12], [x17]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 160f\n" "154:" // Height 6: Partial direct writeback: partial_2_8 "tbz x16, #1, 155f\n" "st1 { v8.h }[4], [x17], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" "tbz x16, #0, 160f\n" "st1 { v8.b }[10], [x17]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 160f\n" "155:" // Height 6: Partial direct writeback: partial_1_8 "tbz x16, #0, 160f\n" "st1 { v8.b }[8], [x17]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 160f\n" "156:" // Height 6: Partial direct writeback: partial_4_0 "tbz x16, #2, 158f\n" "str s8, [x17], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], 
#0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" "tbz x16, #1, 157f\n" "st1 { v8.h }[2], [x17], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], [x20], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" "tbz x16, #0, 160f\n" "st1 { v8.b }[6], [x17]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + "st1 { v28.b }[6], [x21]\n" "b 160f\n" "157:" // Height 6: Partial direct writeback: partial_1_4 "tbz x16, #0, 160f\n" "st1 { v8.b }[4], [x17]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 160f\n" "158:" // Height 6: Partial direct writeback: partial_2_0 "tbz x16, #1, 159f\n" "str h8, [x17], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" "tbz x16, #0, 160f\n" "st1 { v8.b }[2], [x17]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 160f\n" "159:" // Height 6: Partial direct writeback: partial_1_0 "str b8, [x17, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "160:" // Height 6: Partial direct writeback: Done "b 162f\n" "161:" // Height 6: Full writeback "str q8, [x17, #0x0]\n" "add x17, x17, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "162:" // Height 6: Writeback done "subs x16, x16, #0x10\n" "bgt 137b\n" @@ -3761,7 +3760,6 @@ void a64_hybrid_s8qs_dot_6x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "164:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, 
multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp index 598d1524e8..f3942328a6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -85,7 +85,6 @@ void a64_hybrid_s8qs_dot_6x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 136f\n" @@ -111,11 +110,11 @@ void a64_hybrid_s8qs_dot_6x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -132,37 +131,37 @@ void a64_hybrid_s8qs_dot_6x16 ( "blt 8f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q17, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr 
q16, [x9, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x9, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x20\n" "add x9, x9, #0x100\n" @@ -172,37 +171,37 @@ void a64_hybrid_s8qs_dot_6x16 ( "bge 7b\n" "8:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q17, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x9, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x9, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, 
v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x9, x9, #0x100\n" "9:" // Height 1: Multiply loop: Main loop skip @@ -210,17 +209,17 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x27, #0x4\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x9, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x9, #0x0]\n" + ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n" "sub x27, x27, #0x4\n" - "ldr q7, [x9, #0x10]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x9, #0x10]\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" "cmp x27, #0x4\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" "add x9, x9, #0x40\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks @@ -233,28 +232,28 @@ void a64_hybrid_s8qs_dot_6x16 ( "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "13:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q17, [x9, #0x0]\n" + "ldr q16, [x9, #0x10]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q17, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "add x9, x9, #0x40\n" "14:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 4b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q17, [x14, #0x0]\n" + "ldr q16, [x14, #0x10]\n" + "add v8.4s, v8.4s, v17.4s\n" + "add v9.4s, v9.4s, v16.4s\n" + "ldr q17, [x14, #0x20]\n" + "ldr q16, [x14, #0x30]\n" + "add v10.4s, v10.4s, v17.4s\n" + "add v11.4s, v11.4s, v16.4s\n" "prfm pstl1keep, [x11, #0x0]\n" "add x14, x14, #0x40\n" "tbz %x[flags], #4, 15f\n" @@ -270,10 +269,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 16f\n" "15:" // Height 1: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -286,45 +285,45 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v10.4s, v10.4s, v6.4s\n" "sqrdmulh v11.4s, v11.4s, v7.4s\n" "tbz %x[flags], #5, 17f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr 
v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" + "and v19.16b, v8.16b, v0.16b\n" + "and v18.16b, v9.16b, v1.16b\n" + "and v17.16b, v10.16b, v2.16b\n" + "and v16.16b, v11.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v19.4s\n" + "sqadd v9.4s, v9.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v17.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" "17:" // Height 1: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v18.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v17.4s }, [x20]\n" + "add v8.4s, v8.4s, v18.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "add v10.4s, v10.4s, v18.4s\n" + "add v11.4s, v11.4s, v18.4s\n" "cmp x10, #0x10\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" + "smin v8.4s, v8.4s, v17.4s\n" + "smin v9.4s, v9.4s, v17.4s\n" + "smin v10.4s, v10.4s, v17.4s\n" + "smin v11.4s, v11.4s, v17.4s\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v16.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v16.16b\n" "bge 26f\n" "tbz x10, #3, 21f\n" "str d8, [x11], #0x8\n" @@ -399,12 +398,12 @@ void a64_hybrid_s8qs_dot_6x16 ( "31:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 32f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -412,7 +411,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "b 33f\n" "32:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "33:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 36f\n" @@ -425,137 +424,137 @@ void a64_hybrid_s8qs_dot_6x16 ( "34:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q17, [x9, #0x20]\n" "sub x27, x27, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q16, [x9, #0x30]\n" "add x26, x26, 
#0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x40]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x9, #0x40]\n" "add x25, x25, #0x10\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x50]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x9, #0x50]\n" "cmp x27, #0x20\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x9, #0x60]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x9, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x9, #0x70]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x9, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce 
// sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x9, #0x10]\n" "bge 34b\n" "35:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q17, [x9, #0x20]\n" "add x26, x26, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q16, [x9, #0x30]\n" "add x25, x25, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x40]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x9, #0x40]\n" "sub x27, x27, #0x10\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x50]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x9, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x9, #0x60]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x9, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x9, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + 
".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "36:" // Height 2: Multiply loop: Main loop skip "cbz x27, 41f\n" "cmp x27, #0x4\n" "blt 38f\n" "37:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x9, #0x0]\n" + "ldr q16, [x9, #0x10]\n" + ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" + ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" + ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" "bge 37b\n" "38:" // Height 2: Multiply loop: Skip odd blocks "cbz x27, 41f\n" @@ -570,41 +569,41 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "40:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x9, #0x0]\n" + "ldr q16, [x9, #0x10]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x9, 
#0x20]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" "41:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 31b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q19, [x14, #0x0]\n" + "ldr q18, [x14, #0x10]\n" + "add v8.4s, v8.4s, v19.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "ldr q17, [x14, #0x20]\n" + "ldr q16, [x14, #0x30]\n" + "add v10.4s, v10.4s, v17.4s\n" + "add v11.4s, v11.4s, v16.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" + "add x25, x11, x20\n" "prfm pstl1keep, [x11, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" + "add v12.4s, v12.4s, v19.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add v13.4s, v13.4s, v18.4s\n" + "add v14.4s, v14.4s, v17.4s\n" "add x14, x14, #0x40\n" - "add v15.4s, v15.4s, v3.4s\n" + "add v15.4s, v15.4s, v16.4s\n" "tbz %x[flags], #4, 42f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -618,10 +617,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 43f\n" "42:" // Height 2: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -638,141 +637,141 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v14.4s, v14.4s, v6.4s\n" "sqrdmulh v15.4s, v15.4s, v7.4s\n" "tbz %x[flags], #5, 44f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" + "and v19.16b, v8.16b, v0.16b\n" + "and v18.16b, v9.16b, v1.16b\n" + "and v17.16b, v10.16b, v2.16b\n" + "and v16.16b, v11.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v19.4s\n" + "sqadd v9.4s, v9.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v17.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "and v19.16b, 
v12.16b, v0.16b\n" + "and v18.16b, v13.16b, v1.16b\n" + "and v17.16b, v14.16b, v2.16b\n" + "and v16.16b, v15.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v19.4s\n" + "sqadd v13.4s, v13.4s, v18.4s\n" + "sqadd v14.4s, v14.4s, v17.4s\n" + "sqadd v15.4s, v15.4s, v16.4s\n" "44:" // Height 2: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v18.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v17.4s }, [x20]\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "cmp x10, #0x10\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" + "add v8.4s, v8.4s, v18.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add v10.4s, v10.4s, v18.4s\n" + "add v11.4s, v11.4s, v18.4s\n" + "add v12.4s, v12.4s, v18.4s\n" + "add v13.4s, v13.4s, v18.4s\n" + "add v14.4s, v14.4s, v18.4s\n" + "add v15.4s, v15.4s, v18.4s\n" + "smin v8.4s, v8.4s, v17.4s\n" + "smin v9.4s, v9.4s, v17.4s\n" + "smin v10.4s, v10.4s, v17.4s\n" + "smin v11.4s, v11.4s, v17.4s\n" + "smin v12.4s, v12.4s, v17.4s\n" + "smin v13.4s, v13.4s, v17.4s\n" + "smin v14.4s, v14.4s, v17.4s\n" + "smin v15.4s, v15.4s, v17.4s\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smax v12.4s, v12.4s, v16.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v14.4s, v14.4s, v16.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v17.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.8h, v14.8h, v15.8h\n" + "uzp1 v8.16b, v8.16b, v17.16b\n" + "uzp1 v12.16b, v12.16b, v16.16b\n" "bge 53f\n" "tbz x10, #3, 48f\n" "str d8, [x11], #0x8\n" - "str d12, [x24], #0x8\n" + "str d12, [x25], #0x8\n" "tbz x10, #2, 46f\n" "st1 { v8.s }[2], [x11], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" "tbz x10, #1, 45f\n" "st1 { v8.h }[6], [x11], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" "tbz x10, #0, 52f\n" "st1 { v8.b }[14], [x11]\n" - "st1 { v12.b }[14], [x24]\n" + "st1 { v12.b }[14], [x25]\n" "b 52f\n" "45:" // Height 2: Partial direct writeback: partial_1_12 "tbz x10, #0, 52f\n" "st1 
{ v8.b }[12], [x11]\n" - "st1 { v12.b }[12], [x24]\n" + "st1 { v12.b }[12], [x25]\n" "b 52f\n" "46:" // Height 2: Partial direct writeback: partial_2_8 "tbz x10, #1, 47f\n" "st1 { v8.h }[4], [x11], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" "tbz x10, #0, 52f\n" "st1 { v8.b }[10], [x11]\n" - "st1 { v12.b }[10], [x24]\n" + "st1 { v12.b }[10], [x25]\n" "b 52f\n" "47:" // Height 2: Partial direct writeback: partial_1_8 "tbz x10, #0, 52f\n" "st1 { v8.b }[8], [x11]\n" - "st1 { v12.b }[8], [x24]\n" + "st1 { v12.b }[8], [x25]\n" "b 52f\n" "48:" // Height 2: Partial direct writeback: partial_4_0 "tbz x10, #2, 50f\n" "str s8, [x11], #0x4\n" - "str s12, [x24], #0x4\n" + "str s12, [x25], #0x4\n" "tbz x10, #1, 49f\n" "st1 { v8.h }[2], [x11], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" "tbz x10, #0, 52f\n" "st1 { v8.b }[6], [x11]\n" - "st1 { v12.b }[6], [x24]\n" + "st1 { v12.b }[6], [x25]\n" "b 52f\n" "49:" // Height 2: Partial direct writeback: partial_1_4 "tbz x10, #0, 52f\n" "st1 { v8.b }[4], [x11]\n" - "st1 { v12.b }[4], [x24]\n" + "st1 { v12.b }[4], [x25]\n" "b 52f\n" "50:" // Height 2: Partial direct writeback: partial_2_0 "tbz x10, #1, 51f\n" "str h8, [x11], #0x2\n" - "str h12, [x24], #0x2\n" + "str h12, [x25], #0x2\n" "tbz x10, #0, 52f\n" "st1 { v8.b }[2], [x11]\n" - "st1 { v12.b }[2], [x24]\n" + "st1 { v12.b }[2], [x25]\n" "b 52f\n" "51:" // Height 2: Partial direct writeback: partial_1_0 "str b8, [x11, #0x0]\n" - "str b12, [x24, #0x0]\n" + "str b12, [x25, #0x0]\n" "52:" // Height 2: Partial direct writeback: Done "b 54f\n" "53:" // Height 2: Full writeback "str q8, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q12, [x24, #0x0]\n" + "str q12, [x25, #0x0]\n" "54:" // Height 2: Writeback done "subs x10, x10, #0x10\n" "bgt 29b\n" @@ -802,13 +801,13 @@ void a64_hybrid_s8qs_dot_6x16 ( "58:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -817,8 +816,8 @@ void a64_hybrid_s8qs_dot_6x16 ( "b 60f\n" "59:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "60:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 63f\n" @@ -835,75 +834,75 @@ void a64_hybrid_s8qs_dot_6x16 ( "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q21, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x25, x25, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q20, [x9, #0x30]\n" "add x24, x24, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" "cmp x27, 
#0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x9, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x50]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x9, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x9, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x9, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x9, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x9, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, 
v2.4b[2]\n" + "ldr q21, [x9, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x9, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x9, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x9, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x9, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x9, #0x10]\n" "bge 61b\n" @@ -913,98 +912,98 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q21, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x24, x24, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q20, [x9, #0x30]\n" "sub x27, x27, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x9, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, 
v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x9, #0x50]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x9, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x9, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x9, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x9, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x9, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x9, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x9, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x9, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x9, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr 
q20, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "63:" // Height 3: Multiply loop: Main loop skip "cbz x27, 68f\n" "cmp x27, #0x4\n" "blt 65f\n" "64:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x9, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x9, #0x0]\n" + ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" + ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n" + "ldr q20, [x9, #0x10]\n" + ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x9, #0x20]\n" + ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" + ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" "bge 64b\n" "65:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 68f\n" @@ -1022,51 +1021,51 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "67:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q21, [x9, #0x0]\n" + "ldr q20, [x9, #0x10]\n" + ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, 
v2.4b[0]\n" + "ldr q21, [x9, #0x20]\n" + ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" "68:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 58b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q23, [x14, #0x0]\n" + "ldr q22, [x14, #0x10]\n" + "add v8.4s, v8.4s, v23.4s\n" + "add v9.4s, v9.4s, v22.4s\n" + "ldr q21, [x14, #0x20]\n" + "ldr q20, [x14, #0x30]\n" + "add v10.4s, v10.4s, v21.4s\n" + "add v11.4s, v11.4s, v20.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" + "add x25, x11, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" + "add v12.4s, v12.4s, v23.4s\n" + "add v13.4s, v13.4s, v22.4s\n" + "add v14.4s, v14.4s, v21.4s\n" + "add v15.4s, v15.4s, v20.4s\n" "add x14, x14, #0x40\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v16.4s, v16.4s, v23.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, v20.4s\n" "tbz %x[flags], #4, 69f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -1080,10 +1079,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 70f\n" "69:" // Height 3: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1104,55 +1103,55 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v18.4s, v18.4s, v6.4s\n" "sqrdmulh v19.4s, v19.4s, v7.4s\n" "tbz %x[flags], #5, 71f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, 
v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v8.16b, v0.16b\n" + "and v22.16b, v9.16b, v1.16b\n" + "and v21.16b, v10.16b, v2.16b\n" + "and v20.16b, v11.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v23.4s\n" + "sqadd v9.4s, v9.4s, v22.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sqadd v11.4s, v11.4s, v20.4s\n" + "and v23.16b, v12.16b, v0.16b\n" + "and v22.16b, v13.16b, v1.16b\n" + "and v21.16b, v14.16b, v2.16b\n" + "and v20.16b, v15.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v23.4s\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqadd v14.4s, v14.4s, v21.4s\n" + "sqadd v15.4s, v15.4s, v20.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v1.16b\n" + "and v21.16b, v18.16b, v2.16b\n" + "and v20.16b, v19.16b, v3.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "71:" // Height 3: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v22.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v21.4s }, [x20]\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -1160,132 +1159,132 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin 
v19.4s, v19.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add v8.4s, v8.4s, v22.4s\n" + "add v9.4s, v9.4s, v22.4s\n" + "add v10.4s, v10.4s, v22.4s\n" + "add v11.4s, v11.4s, v22.4s\n" + "add v12.4s, v12.4s, v22.4s\n" + "add v13.4s, v13.4s, v22.4s\n" + "add v14.4s, v14.4s, v22.4s\n" + "add v15.4s, v15.4s, v22.4s\n" + "add v16.4s, v16.4s, v22.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v22.4s\n" + "add v19.4s, v19.4s, v22.4s\n" + "smin v8.4s, v8.4s, v21.4s\n" + "smin v9.4s, v9.4s, v21.4s\n" + "smin v10.4s, v10.4s, v21.4s\n" + "smin v11.4s, v11.4s, v21.4s\n" + "smin v12.4s, v12.4s, v21.4s\n" + "smin v13.4s, v13.4s, v21.4s\n" + "smin v14.4s, v14.4s, v21.4s\n" + "smin v15.4s, v15.4s, v21.4s\n" + "smin v16.4s, v16.4s, v21.4s\n" + "smin v17.4s, v17.4s, v21.4s\n" + "smin v18.4s, v18.4s, v21.4s\n" + "smin v19.4s, v19.4s, v21.4s\n" + "smax v8.4s, v8.4s, v20.4s\n" + "smax v9.4s, v9.4s, v20.4s\n" + "smax v10.4s, v10.4s, v20.4s\n" + "smax v11.4s, v11.4s, v20.4s\n" + "smax v12.4s, v12.4s, v20.4s\n" + "smax v13.4s, v13.4s, v20.4s\n" + "smax v14.4s, v14.4s, v20.4s\n" + "smax v15.4s, v15.4s, v20.4s\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v21.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v20.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v8.16b, v8.16b, v21.16b\n" + "uzp1 v12.16b, v12.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 80f\n" "tbz x10, #3, 75f\n" "str d8, [x11], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" "tbz x10, #2, 73f\n" "st1 { v8.s }[2], [x11], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" "tbz x10, #1, 72f\n" "st1 { v8.h }[6], [x11], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" "tbz x10, #0, 79f\n" "st1 { v8.b }[14], [x11]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" "b 79f\n" "72:" // Height 3: Partial direct writeback: partial_1_12 "tbz x10, #0, 79f\n" "st1 { v8.b }[12], [x11]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" "b 79f\n" "73:" // Height 3: Partial direct writeback: partial_2_8 "tbz x10, #1, 74f\n" "st1 { v8.h }[4], [x11], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" "tbz x10, #0, 79f\n" "st1 { v8.b }[10], [x11]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" "b 79f\n" "74:" // Height 3: Partial direct writeback: partial_1_8 "tbz x10, #0, 
79f\n" "st1 { v8.b }[8], [x11]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" "b 79f\n" "75:" // Height 3: Partial direct writeback: partial_4_0 "tbz x10, #2, 77f\n" "str s8, [x11], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" "tbz x10, #1, 76f\n" "st1 { v8.h }[2], [x11], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" "tbz x10, #0, 79f\n" "st1 { v8.b }[6], [x11]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" "b 79f\n" "76:" // Height 3: Partial direct writeback: partial_1_4 "tbz x10, #0, 79f\n" "st1 { v8.b }[4], [x11]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" "b 79f\n" "77:" // Height 3: Partial direct writeback: partial_2_0 "tbz x10, #1, 78f\n" "str h8, [x11], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" "tbz x10, #0, 79f\n" "st1 { v8.b }[2], [x11]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" "b 79f\n" "78:" // Height 3: Partial direct writeback: partial_1_0 "str b8, [x11, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" "79:" // Height 3: Partial direct writeback: Done "b 81f\n" "80:" // Height 3: Full writeback "str q8, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" "81:" // Height 3: Writeback done "subs x10, x10, #0x10\n" "bgt 56b\n" @@ -1319,14 +1318,14 @@ void a64_hybrid_s8qs_dot_6x16 ( "85:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 86f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 87f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1336,9 +1335,9 @@ void a64_hybrid_s8qs_dot_6x16 ( "b 87f\n" "86:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "87:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 90f\n" @@ -1357,7 +1356,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q25, [x9, #0x20]\n" "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1365,85 +1364,85 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x23, x23, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, 
v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q24, [x9, #0x30]\n" "cmp x27, #0x20\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x40]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x9, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, 
v3.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x9, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + 
".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x9, #0x10]\n" "bge 88b\n" @@ -1454,7 +1453,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x25, x25, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q25, [x9, #0x20]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1462,112 +1461,112 @@ void a64_hybrid_s8qs_dot_6x16 ( "sub x27, x27, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q24, [x9, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x40]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x9, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" 
- ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x9, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, 
v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "90:" // Height 4: Multiply loop: Main loop skip "cbz x27, 95f\n" "cmp x27, #0x4\n" "blt 92f\n" "91:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x9, #0x0]\n" + "ldr q24, [x9, #0x10]\n" + ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" + ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" + ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" "bge 91b\n" "92:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 95f\n" @@ -1588,61 +1587,61 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" "94:" // Height 4: Multiply loop: Ragged operand 
read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q25, [x9, #0x0]\n" + "ldr q24, [x9, #0x10]\n" + ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" "95:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 85b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q27, [x14, #0x0]\n" + "ldr q26, [x14, #0x10]\n" + "add v8.4s, v8.4s, v27.4s\n" + "add v9.4s, v9.4s, v26.4s\n" + "ldr q25, [x14, #0x20]\n" + "ldr q24, [x14, #0x30]\n" + "add v10.4s, v10.4s, v25.4s\n" + "add v11.4s, v11.4s, v24.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" + "add x25, x11, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x11, #0x0]\n" - "add x22, x23, x20\n" + "add x23, x24, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" + "add v12.4s, v12.4s, v27.4s\n" "prfm pstl1keep, [x23, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x22, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" + "add v13.4s, v13.4s, v26.4s\n" + "add v14.4s, v14.4s, v25.4s\n" "add x14, x14, #0x40\n" - "add v15.4s, v15.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add 
v15.4s, v15.4s, v24.4s\n" + "add v16.4s, v16.4s, v27.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v27.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v23.4s, v23.4s, v24.4s\n" "tbz %x[flags], #4, 96f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -1656,10 +1655,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 97f\n" "96:" // Height 4: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1684,67 +1683,67 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v22.4s, v22.4s, v6.4s\n" "sqrdmulh v23.4s, v23.4s, v7.4s\n" "tbz %x[flags], #5, 98f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" + "and v27.16b, v8.16b, v0.16b\n" + "and v26.16b, v9.16b, v1.16b\n" + "and v25.16b, v10.16b, v2.16b\n" + "and v24.16b, v11.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "sqadd v9.4s, v9.4s, v26.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v24.4s\n" + "and v27.16b, v12.16b, v0.16b\n" + "and v26.16b, v13.16b, v1.16b\n" + "and v25.16b, v14.16b, v2.16b\n" + "and v24.16b, v15.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v27.4s\n" + "sqadd v13.4s, v13.4s, v26.4s\n" + "sqadd v14.4s, v14.4s, v25.4s\n" + "sqadd v15.4s, v15.4s, v24.4s\n" + "and v27.16b, v16.16b, v0.16b\n" + "and v26.16b, v17.16b, v1.16b\n" + "and v25.16b, v18.16b, v2.16b\n" + "and v24.16b, v19.16b, v3.16b\n" + "sshr 
v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v27.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "sqadd v18.4s, v18.4s, v25.4s\n" + "sqadd v19.4s, v19.4s, v24.4s\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v1.16b\n" + "and v25.16b, v22.16b, v2.16b\n" + "and v24.16b, v23.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "98:" // Height 4: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -1756,163 +1755,163 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v21.4s, v21.4s, v1.4s\n" "srshl v22.4s, v22.4s, v2.4s\n" "srshl v23.4s, v23.4s, v3.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add v8.4s, v8.4s, v26.4s\n" + "add v9.4s, v9.4s, v26.4s\n" + "add v10.4s, v10.4s, v26.4s\n" + "add v11.4s, v11.4s, v26.4s\n" + "add v12.4s, v12.4s, v26.4s\n" + "add v13.4s, v13.4s, v26.4s\n" + "add v14.4s, v14.4s, v26.4s\n" + "add v15.4s, v15.4s, v26.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, 
v26.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "smin v8.4s, v8.4s, v25.4s\n" + "smin v9.4s, v9.4s, v25.4s\n" + "smin v10.4s, v10.4s, v25.4s\n" + "smin v11.4s, v11.4s, v25.4s\n" + "smin v12.4s, v12.4s, v25.4s\n" + "smin v13.4s, v13.4s, v25.4s\n" + "smin v14.4s, v14.4s, v25.4s\n" + "smin v15.4s, v15.4s, v25.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smax v8.4s, v8.4s, v24.4s\n" + "smax v9.4s, v9.4s, v24.4s\n" + "smax v10.4s, v10.4s, v24.4s\n" + "smax v11.4s, v11.4s, v24.4s\n" + "smax v12.4s, v12.4s, v24.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smax v14.4s, v14.4s, v24.4s\n" + "smax v15.4s, v15.4s, v24.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v25.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v24.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" + "uzp1 v8.16b, v8.16b, v25.16b\n" + "uzp1 v12.16b, v12.16b, v24.16b\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 107f\n" "tbz x10, #3, 102f\n" "str d8, [x11], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" "tbz x10, #2, 100f\n" "st1 { v8.s }[2], [x11], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" "tbz x10, #1, 99f\n" "st1 { v8.h }[6], [x11], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" "tbz x10, #0, 106f\n" "st1 { v8.b }[14], [x11]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" "b 106f\n" "99:" // Height 4: Partial direct writeback: partial_1_12 "tbz x10, #0, 106f\n" "st1 { v8.b }[12], [x11]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" "b 106f\n" "100:" // Height 4: Partial direct writeback: partial_2_8 "tbz x10, #1, 101f\n" "st1 { v8.h }[4], [x11], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" "tbz x10, #0, 106f\n" "st1 { v8.b }[10], [x11]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b 
}[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" "b 106f\n" "101:" // Height 4: Partial direct writeback: partial_1_8 "tbz x10, #0, 106f\n" "st1 { v8.b }[8], [x11]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" "b 106f\n" "102:" // Height 4: Partial direct writeback: partial_4_0 "tbz x10, #2, 104f\n" "str s8, [x11], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" "tbz x10, #1, 103f\n" "st1 { v8.h }[2], [x11], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" "tbz x10, #0, 106f\n" "st1 { v8.b }[6], [x11]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" "b 106f\n" "103:" // Height 4: Partial direct writeback: partial_1_4 "tbz x10, #0, 106f\n" "st1 { v8.b }[4], [x11]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" "b 106f\n" "104:" // Height 4: Partial direct writeback: partial_2_0 "tbz x10, #1, 105f\n" "str h8, [x11], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" "tbz x10, #0, 106f\n" "st1 { v8.b }[2], [x11]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" "b 106f\n" "105:" // Height 4: Partial direct writeback: partial_1_0 "str b8, [x11, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" "106:" // Height 4: Partial direct writeback: Done "b 108f\n" "107:" // Height 4: Full writeback "str q8, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" "108:" // Height 4: Writeback done "subs x10, x10, #0x10\n" "bgt 83b\n" @@ -1950,15 +1949,15 @@ void a64_hybrid_s8qs_dot_6x16 ( "112:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 113f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 114f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1969,10 +1968,10 @@ void 
a64_hybrid_s8qs_dot_6x16 ( "b 114f\n" "113:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "114:" // Height 5: input setup done "cmp x27, #0x10\n" "blt 117f\n" @@ -1995,7 +1994,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q29, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2004,100 +2003,100 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x27, #0x20\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q28, [x9, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // 
sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x9, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x9, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x9, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x9, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x9, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x9, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x9, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb91 // sdot v17.4s, 
v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x9, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x9, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x9, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x9, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" "ldr q6, [x9, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x9, #0x10]\n" "bge 115b\n" @@ -2111,7 +2110,7 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x20]\n" + "ldr q29, [x9, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2120,131 +2119,131 @@ void a64_hybrid_s8qs_dot_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q28, [x9, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae 
// sdot v14.4s, v29.16b, v1.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x9, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x9, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x9, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x9, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x9, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x9, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x9, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" 
- ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x9, #0xf0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x9, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x9, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x9, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x9, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x9, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x9, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x9, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x9, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x9, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x9, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x9, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, 
v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" "117:" // Height 5: Multiply loop: Main loop skip "cbz x27, 122f\n" "cmp x27, #0x4\n" "blt 119f\n" "118:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x9, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x9, #0x0]\n" + ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + "ldr q28, [x9, #0x10]\n" + ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x9, #0x20]\n" + ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 
0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" "bge 118b\n" "119:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 122f\n" @@ -2268,71 +2267,71 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr b3, [x23, #0x0]\n" "ldr b4, [x22, #0x0]\n" "121:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q29, [x9, #0x0]\n" + "ldr q28, [x9, #0x10]\n" + ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x9, #0x20]\n" + ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" "122:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 112b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q31, [x14, #0x0]\n" + "ldr q30, [x14, #0x10]\n" + "add v8.4s, v8.4s, v31.4s\n" + "add v9.4s, v9.4s, v30.4s\n" + "ldr q29, [x14, #0x20]\n" + "ldr q28, [x14, #0x30]\n" + "add v10.4s, v10.4s, v29.4s\n" + "add v11.4s, v11.4s, v28.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" + "add x25, x11, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x11, #0x0]\n" + "add x23, x24, x20\n" "add x22, x23, x20\n" - "add x21, x22, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" - "add v15.4s, v15.4s, v3.4s\n" + "add v12.4s, v12.4s, v31.4s\n" + "add v13.4s, v13.4s, v30.4s\n" + "add v14.4s, v14.4s, v29.4s\n" + "add v15.4s, v15.4s, v28.4s\n" "add x14, x14, #0x40\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "add v16.4s, v16.4s, v31.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v31.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v31.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" "tbz %x[flags], #4, 123f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -2346,10 +2345,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 124f\n" "123:" // Height 5: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -2378,79 +2377,79 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v27.4s, v27.4s, v7.4s\n" "tbz %x[flags], #5, 125f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, 
v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" + "and v31.16b, v8.16b, v0.16b\n" + "and v30.16b, v9.16b, v1.16b\n" + "and v29.16b, v10.16b, v2.16b\n" + "and v28.16b, v11.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v31.4s\n" + "sqadd v9.4s, v9.4s, v30.4s\n" + "sqadd v10.4s, v10.4s, v29.4s\n" + "sqadd v11.4s, v11.4s, v28.4s\n" + "and v31.16b, v12.16b, v0.16b\n" + "and v30.16b, v13.16b, v1.16b\n" + "and v29.16b, v14.16b, v2.16b\n" + "and v28.16b, v15.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v31.4s\n" + "sqadd v13.4s, v13.4s, v30.4s\n" + "sqadd v14.4s, v14.4s, v29.4s\n" + "sqadd v15.4s, v15.4s, v28.4s\n" + "and v31.16b, v16.16b, v0.16b\n" + "and v30.16b, v17.16b, v1.16b\n" + "and v29.16b, v18.16b, v2.16b\n" + "and v28.16b, v19.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v31.4s\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "and v31.16b, v20.16b, v0.16b\n" + "and v30.16b, v21.16b, v1.16b\n" + "and v29.16b, v22.16b, v2.16b\n" + "and v28.16b, v23.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v31.4s\n" + "sqadd v21.4s, v21.4s, v30.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sqadd v23.4s, v23.4s, v28.4s\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v1.16b\n" + "and v29.16b, v26.16b, v2.16b\n" + "and v28.16b, v27.16b, v3.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "125:" // Height 5: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v30.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - 
"ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -2466,194 +2465,194 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v8.4s, v8.4s, v30.4s\n" + "add v9.4s, v9.4s, v30.4s\n" + "add v10.4s, v10.4s, v30.4s\n" + "add v11.4s, v11.4s, v30.4s\n" + "add v12.4s, v12.4s, v30.4s\n" + "add v13.4s, v13.4s, v30.4s\n" + "add v14.4s, v14.4s, v30.4s\n" + "add v15.4s, v15.4s, v30.4s\n" + "add v16.4s, v16.4s, v30.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v30.4s\n" + "add v19.4s, v19.4s, v30.4s\n" + "add v20.4s, v20.4s, v30.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v30.4s\n" + "add v23.4s, v23.4s, v30.4s\n" + "add v24.4s, v24.4s, v30.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v30.4s\n" + "add v27.4s, v27.4s, v30.4s\n" + "smin v8.4s, v8.4s, v29.4s\n" + "smin v9.4s, v9.4s, v29.4s\n" + "smin v10.4s, v10.4s, v29.4s\n" + "smin v11.4s, v11.4s, v29.4s\n" + "smin v12.4s, v12.4s, v29.4s\n" + "smin v13.4s, v13.4s, v29.4s\n" + "smin v14.4s, v14.4s, v29.4s\n" + "smin v15.4s, v15.4s, v29.4s\n" + "smin v16.4s, v16.4s, v29.4s\n" + "smin v17.4s, v17.4s, v29.4s\n" + "smin v18.4s, v18.4s, v29.4s\n" + "smin v19.4s, v19.4s, v29.4s\n" + "smin v20.4s, v20.4s, v29.4s\n" 
+ "smin v21.4s, v21.4s, v29.4s\n" + "smin v22.4s, v22.4s, v29.4s\n" + "smin v23.4s, v23.4s, v29.4s\n" + "smin v24.4s, v24.4s, v29.4s\n" + "smin v25.4s, v25.4s, v29.4s\n" + "smin v26.4s, v26.4s, v29.4s\n" + "smin v27.4s, v27.4s, v29.4s\n" + "smax v8.4s, v8.4s, v28.4s\n" + "smax v9.4s, v9.4s, v28.4s\n" + "smax v10.4s, v10.4s, v28.4s\n" + "smax v11.4s, v11.4s, v28.4s\n" + "smax v12.4s, v12.4s, v28.4s\n" + "smax v13.4s, v13.4s, v28.4s\n" + "smax v14.4s, v14.4s, v28.4s\n" + "smax v15.4s, v15.4s, v28.4s\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v29.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v28.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v29.16b\n" + "uzp1 v12.16b, v12.16b, v28.16b\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 134f\n" "tbz x10, #3, 129f\n" "str d8, [x11], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x10, #2, 127f\n" "st1 { v8.s }[2], [x11], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x10, #1, 126f\n" "st1 { v8.h }[6], [x11], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x10, #0, 133f\n" "st1 { v8.b }[14], [x11]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 133f\n" "126:" // Height 5: Partial direct writeback: partial_1_12 "tbz x10, #0, 133f\n" "st1 { v8.b }[12], [x11]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 133f\n" "127:" // Height 5: Partial direct writeback: partial_2_8 "tbz x10, #1, 128f\n" "st1 { v8.h }[4], [x11], #0x2\n" - "st1 { v12.h }[4], [x24], 
#0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x10, #0, 133f\n" "st1 { v8.b }[10], [x11]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 133f\n" "128:" // Height 5: Partial direct writeback: partial_1_8 "tbz x10, #0, 133f\n" "st1 { v8.b }[8], [x11]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 133f\n" "129:" // Height 5: Partial direct writeback: partial_4_0 "tbz x10, #2, 131f\n" "str s8, [x11], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x10, #1, 130f\n" "st1 { v8.h }[2], [x11], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x10, #0, 133f\n" "st1 { v8.b }[6], [x11]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 133f\n" "130:" // Height 5: Partial direct writeback: partial_1_4 "tbz x10, #0, 133f\n" "st1 { v8.b }[4], [x11]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 133f\n" "131:" // Height 5: Partial direct writeback: partial_2_0 "tbz x10, #1, 132f\n" "str h8, [x11], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x10, #0, 133f\n" "st1 { v8.b }[2], [x11]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 133f\n" "132:" // Height 5: Partial direct writeback: partial_1_0 "str b8, [x11, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "133:" // Height 5: Partial direct writeback: Done "b 135f\n" "134:" // Height 5: Full writeback "str q8, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "135:" // Height 5: 
Writeback done "subs x10, x10, #0x10\n" "bgt 110b\n" @@ -2698,16 +2697,16 @@ void a64_hybrid_s8qs_dot_6x16 ( "139:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 140f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 141f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2719,11 +2718,11 @@ void a64_hybrid_s8qs_dot_6x16 ( "b 141f\n" "140:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "141:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 144f\n" @@ -3002,43 +3001,43 @@ void a64_hybrid_s8qs_dot_6x16 ( "cmp x27, #0x4\n" "blt 146f\n" "145:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x9, #0x0]\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" + ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x9, #0x20]\n" + ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, 
v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" + ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" "bge 145b\n" "146:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 149f\n" @@ -3065,81 +3064,81 @@ void a64_hybrid_s8qs_dot_6x16 ( "ldr b4, [x22, #0x0]\n" "ldr b5, [x21, #0x0]\n" "148:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x9, #0x30]\n" + "ldr q7, [x9, #0x0]\n" + "ldr q6, [x9, #0x10]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x9, #0x20]\n" + ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x9, #0x30]\n" "add x9, x9, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, 
v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" "149:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 139b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "ldr q3, [x14, #0x0]\n" + "ldr q2, [x14, #0x10]\n" + "add v8.4s, v8.4s, v3.4s\n" + "add v9.4s, v9.4s, v2.4s\n" + "ldr q1, [x14, #0x20]\n" + "ldr q0, [x14, #0x30]\n" + "add v10.4s, v10.4s, v1.4s\n" + "add v11.4s, v11.4s, v0.4s\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" + "add x25, x11, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x11, #0x0]\n" + "add x23, x24, x20\n" "add x22, x23, x20\n" - "add x21, x22, x20\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" + "add x21, x22, x20\n" "prfm pstl1keep, [x23, #0x0]\n" - "add x20, x21, x20\n" "prfm pstl1keep, [x22, #0x0]\n" + "add v12.4s, v12.4s, v3.4s\n" "prfm pstl1keep, [x21, #0x0]\n" - "add v12.4s, v12.4s, v0.4s\n" - "prfm pstl1keep, [x20, #0x0]\n" - "add v13.4s, v13.4s, v1.4s\n" - "add v14.4s, v14.4s, v2.4s\n" + "add v13.4s, v13.4s, v2.4s\n" + "add v14.4s, v14.4s, v1.4s\n" "add x14, x14, #0x40\n" - "add v15.4s, v15.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v0.4s\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" + "add v15.4s, v15.4s, v0.4s\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v2.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v1.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v2.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v1.4s\n" + "add v31.4s, v31.4s, v0.4s\n" "tbz %x[flags], #4, 150f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -3153,10 +3152,10 @@ void a64_hybrid_s8qs_dot_6x16 ( "add x13, x13, #0x40\n" "b 151f\n" "150:" // Height 6: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s 
}, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -3189,91 +3188,91 @@ void a64_hybrid_s8qs_dot_6x16 ( "sqrdmulh v30.4s, v30.4s, v6.4s\n" "sqrdmulh v31.4s, v31.4s, v7.4s\n" "tbz %x[flags], #5, 152f\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v7.16b, v8.16b, v0.16b\n" + "and v6.16b, v9.16b, v1.16b\n" + "and v5.16b, v10.16b, v2.16b\n" + "and v4.16b, v11.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v12.16b, v0.16b\n" - "and v5.16b, v13.16b, v1.16b\n" - "and v6.16b, v14.16b, v2.16b\n" - "and v7.16b, v15.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v12.4s, v12.4s, v4.4s\n" - "sqadd v13.4s, v13.4s, v5.4s\n" - "sqadd v14.4s, v14.4s, v6.4s\n" - "sqadd v15.4s, v15.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v20.16b, v0.16b\n" - "and v5.16b, v21.16b, v1.16b\n" - "and v6.16b, v22.16b, v2.16b\n" - "and v7.16b, v23.16b, v3.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v7.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "sqadd v10.4s, v10.4s, v5.4s\n" + "sqadd v11.4s, v11.4s, v4.4s\n" + "and v7.16b, v12.16b, v0.16b\n" + "and v6.16b, v13.16b, v1.16b\n" + "and v5.16b, v14.16b, v2.16b\n" + "and v4.16b, v15.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "sqadd v21.4s, v21.4s, v5.4s\n" - "sqadd v22.4s, v22.4s, v6.4s\n" - "sqadd v23.4s, v23.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v7.4s\n" + "sqadd v13.4s, v13.4s, v6.4s\n" + "sqadd v14.4s, v14.4s, v5.4s\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "and v7.16b, v16.16b, v0.16b\n" + "and v6.16b, v17.16b, v1.16b\n" + "and v5.16b, v18.16b, v2.16b\n" + "and v4.16b, v19.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" - "and v4.16b, v28.16b, v0.16b\n" - "and v5.16b, v29.16b, v1.16b\n" - "and v6.16b, v30.16b, v2.16b\n" - "and v7.16b, v31.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v7.4s\n" + "sqadd v17.4s, v17.4s, v6.4s\n" + "sqadd v18.4s, v18.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "and v7.16b, v20.16b, v0.16b\n" + "and v6.16b, 
v21.16b, v1.16b\n" + "and v5.16b, v22.16b, v2.16b\n" + "and v4.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v7.4s\n" + "sqadd v21.4s, v21.4s, v6.4s\n" + "sqadd v22.4s, v22.4s, v5.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v7.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v1.16b\n" + "and v5.16b, v26.16b, v2.16b\n" + "and v4.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v7.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "sqadd v27.4s, v27.4s, v4.4s\n" + "and v7.16b, v28.16b, v0.16b\n" + "and v6.16b, v29.16b, v1.16b\n" + "and v5.16b, v30.16b, v2.16b\n" + "and v4.16b, v31.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v4.4s\n" - "sqadd v29.4s, v29.4s, v5.4s\n" - "sqadd v30.4s, v30.4s, v6.4s\n" - "sqadd v31.4s, v31.4s, v7.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v7.4s\n" + "sqadd v29.4s, v29.4s, v6.4s\n" + "sqadd v30.4s, v30.4s, v5.4s\n" + "sqadd v31.4s, v31.4s, v4.4s\n" "152:" // Height 6: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v6.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x20]\n" "srshl v12.4s, v12.4s, v0.4s\n" "srshl v13.4s, v13.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x20]\n" "srshl v14.4s, v14.4s, v2.4s\n" "srshl v15.4s, v15.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -3293,225 +3292,225 @@ void a64_hybrid_s8qs_dot_6x16 ( "srshl v29.4s, v29.4s, v1.4s\n" "srshl v30.4s, v30.4s, v2.4s\n" "srshl v31.4s, v31.4s, v3.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, 
v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "add v8.4s, v8.4s, v6.4s\n" + "add v9.4s, v9.4s, v6.4s\n" + "add v10.4s, v10.4s, v6.4s\n" + "add v11.4s, v11.4s, v6.4s\n" + "add v12.4s, v12.4s, v6.4s\n" + "add v13.4s, v13.4s, v6.4s\n" + "add v14.4s, v14.4s, v6.4s\n" + "add v15.4s, v15.4s, v6.4s\n" + "add v16.4s, v16.4s, v6.4s\n" + "add v17.4s, v17.4s, v6.4s\n" + "add v18.4s, v18.4s, v6.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "add v20.4s, v20.4s, v6.4s\n" + "add v21.4s, v21.4s, v6.4s\n" + "add v22.4s, v22.4s, v6.4s\n" + "add v23.4s, v23.4s, v6.4s\n" + "add v24.4s, v24.4s, v6.4s\n" + "add v25.4s, v25.4s, v6.4s\n" + "add v26.4s, v26.4s, v6.4s\n" + "add v27.4s, v27.4s, v6.4s\n" + "add v28.4s, v28.4s, v6.4s\n" + "add v29.4s, v29.4s, v6.4s\n" + "add v30.4s, v30.4s, v6.4s\n" + "add v31.4s, v31.4s, v6.4s\n" + "smin v8.4s, v8.4s, v5.4s\n" + "smin v9.4s, v9.4s, v5.4s\n" + "smin v10.4s, v10.4s, v5.4s\n" + "smin v11.4s, v11.4s, v5.4s\n" + "smin v12.4s, v12.4s, v5.4s\n" + "smin v13.4s, v13.4s, v5.4s\n" + "smin v14.4s, v14.4s, v5.4s\n" + "smin v15.4s, v15.4s, v5.4s\n" + "smin v16.4s, v16.4s, v5.4s\n" + "smin v17.4s, v17.4s, v5.4s\n" + "smin v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v5.4s\n" + "smin v20.4s, v20.4s, v5.4s\n" + "smin v21.4s, v21.4s, v5.4s\n" + "smin v22.4s, v22.4s, v5.4s\n" + "smin v23.4s, v23.4s, v5.4s\n" + "smin v24.4s, v24.4s, v5.4s\n" + "smin v25.4s, v25.4s, v5.4s\n" + "smin v26.4s, v26.4s, v5.4s\n" + "smin v27.4s, v27.4s, v5.4s\n" + "smin v28.4s, v28.4s, v5.4s\n" + "smin v29.4s, v29.4s, v5.4s\n" + "smin v30.4s, v30.4s, v5.4s\n" + "smin v31.4s, v31.4s, v5.4s\n" + "smax v8.4s, v8.4s, v4.4s\n" + "smax v9.4s, v9.4s, v4.4s\n" + "smax v10.4s, v10.4s, v4.4s\n" + "smax v11.4s, v11.4s, v4.4s\n" + "smax v12.4s, v12.4s, v4.4s\n" + "smax v13.4s, v13.4s, v4.4s\n" + "smax v14.4s, v14.4s, v4.4s\n" + "smax v15.4s, v15.4s, v4.4s\n" + "smax v16.4s, v16.4s, v4.4s\n" + "smax v17.4s, v17.4s, v4.4s\n" + "smax v18.4s, v18.4s, v4.4s\n" + "smax v19.4s, v19.4s, v4.4s\n" + "smax v20.4s, v20.4s, v4.4s\n" + "smax v21.4s, v21.4s, v4.4s\n" + "smax v22.4s, v22.4s, v4.4s\n" + "smax v23.4s, v23.4s, v4.4s\n" + "smax v24.4s, v24.4s, v4.4s\n" + "smax v25.4s, v25.4s, v4.4s\n" + "smax v26.4s, v26.4s, v4.4s\n" + "smax v27.4s, v27.4s, v4.4s\n" + "smax v28.4s, v28.4s, v4.4s\n" + "smax v29.4s, v29.4s, v4.4s\n" + "smax v30.4s, v30.4s, v4.4s\n" + "smax v31.4s, v31.4s, v4.4s\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v2.8h, v10.8h, v11.8h\n" "uzp1 v12.8h, v12.8h, v13.8h\n" - "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v1.8h, v14.8h, v15.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, 
v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" - "uzp1 v12.16b, v12.16b, v13.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" + "uzp1 v8.16b, v8.16b, v2.16b\n" + "uzp1 v12.16b, v12.16b, v1.16b\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, v28.16b, v17.16b\n" "bge 161f\n" "tbz x10, #3, 156f\n" "str d8, [x11], #0x8\n" - "str d12, [x24], #0x8\n" - "str d16, [x23], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" + "str d12, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" "tbz x10, #2, 154f\n" "st1 { v8.s }[2], [x11], #0x4\n" - "st1 { v12.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" + "st1 { v12.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" "tbz x10, #1, 153f\n" "st1 { v8.h }[6], [x11], #0x2\n" - "st1 { v12.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" + "st1 { v12.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" "tbz x10, #0, 160f\n" "st1 { v8.b }[14], [x11]\n" - "st1 { v12.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "st1 { v12.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 160f\n" "153:" // Height 6: Partial direct writeback: partial_1_12 "tbz x10, #0, 160f\n" "st1 { v8.b }[12], [x11]\n" - "st1 { v12.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "st1 { v12.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 160f\n" "154:" // Height 6: Partial direct writeback: partial_2_8 "tbz x10, #1, 155f\n" "st1 { v8.h }[4], [x11], #0x2\n" - "st1 { v12.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" + "st1 { v12.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" "tbz x10, #0, 160f\n" "st1 { v8.b }[10], [x11]\n" - "st1 { v12.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "st1 { v12.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 160f\n" "155:" // Height 6: Partial direct 
writeback: partial_1_8 "tbz x10, #0, 160f\n" "st1 { v8.b }[8], [x11]\n" - "st1 { v12.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "st1 { v12.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 160f\n" "156:" // Height 6: Partial direct writeback: partial_4_0 "tbz x10, #2, 158f\n" "str s8, [x11], #0x4\n" - "str s12, [x24], #0x4\n" - "str s16, [x23], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" + "str s12, [x25], #0x4\n" + "str s16, [x24], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" "tbz x10, #1, 157f\n" "st1 { v8.h }[2], [x11], #0x2\n" - "st1 { v12.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], [x20], #0x2\n" + "st1 { v12.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" "tbz x10, #0, 160f\n" "st1 { v8.b }[6], [x11]\n" - "st1 { v12.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "st1 { v12.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + "st1 { v28.b }[6], [x21]\n" "b 160f\n" "157:" // Height 6: Partial direct writeback: partial_1_4 "tbz x10, #0, 160f\n" "st1 { v8.b }[4], [x11]\n" - "st1 { v12.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "st1 { v12.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 160f\n" "158:" // Height 6: Partial direct writeback: partial_2_0 "tbz x10, #1, 159f\n" "str h8, [x11], #0x2\n" - "str h12, [x24], #0x2\n" - "str h16, [x23], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" + "str h12, [x25], #0x2\n" + "str h16, [x24], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" "tbz x10, #0, 160f\n" "st1 { v8.b }[2], [x11]\n" - "st1 { v12.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "st1 { v12.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 160f\n" "159:" // Height 6: Partial direct writeback: partial_1_0 "str b8, [x11, #0x0]\n" - "str b12, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b12, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "160:" // Height 6: Partial direct writeback: Done "b 162f\n" "161:" // Height 6: Full writeback "str q8, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q12, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q12, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "162:" // Height 6: Writeback done "subs x10, x10, #0x10\n" "bgt 137b\n" @@ -3527,7 +3526,6 @@ void 
a64_hybrid_s8qs_dot_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "164:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp index 7eacdceae7..d0d5f1b80d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -98,5 +98,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp index fc525531b2..0771829d37 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp @@ -85,7 +85,6 @@ void a64_hybrid_s8qs_mmla_6x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 146f\n" @@ -115,11 +114,11 @@ void a64_hybrid_s8qs_mmla_6x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -135,41 +134,41 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q6, [x9, #0x10]\n" "blt 8f\n" "7:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "trn1 v18.2d, v1.2d, v21.2d\n" + ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" + "trn2 v1.2d, v1.2d, v21.2d\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" + "ldr q16, 
[x9, #0xb0]\n" + ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "cmp x27, #0x20\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" + ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" "ldr q1, [x26, #0x0]\n" "add x9, x9, #0x100\n" "ldr q7, [x9, #0x0]\n" @@ -177,40 +176,40 @@ void a64_hybrid_s8qs_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "bge 7b\n" "8:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "trn1 v18.2d, v1.2d, v19.2d\n" + ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" + "trn2 v1.2d, v1.2d, v19.2d\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" + ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" 
"prfm pldl1keep, [x26, #0x80]\n" "add x9, x9, #0x100\n" "9:" // Height 1: Multiply loop: Main loop skip @@ -218,26 +217,26 @@ void a64_hybrid_s8qs_mmla_6x16 ( "cmp x27, #0x8\n" "blt 11f\n" "10:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x9, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x70]\n" + "ldr d18, [x26], #0x8\n" + "ldr q17, [x9, #0x0]\n" + "trn1 v18.2d, v18.2d, v16.2d\n" + "ldr q31, [x9, #0x10]\n" + ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e9fa64c // smmla v12.4s, v18.16b, v31.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" "add x9, x9, #0x80\n" "bge 10b\n" "11:" // Height 1: Multiply loop: Skip odd blocks @@ -262,44 +261,44 @@ void a64_hybrid_s8qs_mmla_6x16 ( "14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "15:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x9, #0x0]\n" - "ldr q6, [x9, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q17, [x9, #0x0]\n" + "ldr q19, [x9, #0x10]\n" + "trn1 v18.2d, v1.2d, v16.2d\n" + ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e93a64c // smmla v12.4s, v18.16b, v19.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" "add x9, x9, #0x80\n" "16:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 
4b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" + "ldr q19, [x14, #0x0]\n" + "ldr q18, [x14, #0x10]\n" "uzp1 v8.2d, v8.2d, v12.2d\n" "uzp1 v9.2d, v9.2d, v13.2d\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q17, [x14, #0x20]\n" + "ldr q16, [x14, #0x30]\n" "uzp1 v10.2d, v10.2d, v14.2d\n" "uzp1 v11.2d, v11.2d, v15.2d\n" "mov v15.16b, v8.16b\n" "prfm pstl1keep, [x11, #0x0]\n" - "add v15.4s, v15.4s, v0.4s\n" + "add v15.4s, v15.4s, v19.4s\n" "add x14, x14, #0x40\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add v10.4s, v10.4s, v17.4s\n" + "add v11.4s, v11.4s, v16.4s\n" "tbz %x[flags], #4, 17f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -313,10 +312,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 18f\n" "17:" // Height 1: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -329,45 +328,45 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v10.4s, v10.4s, v6.4s\n" "sqrdmulh v11.4s, v11.4s, v7.4s\n" "tbz %x[flags], #5, 19f\n" - "and v4.16b, v15.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" + "and v17.16b, v15.16b, v0.16b\n" + "and v16.16b, v9.16b, v1.16b\n" + "and v25.16b, v10.16b, v2.16b\n" + "and v18.16b, v11.16b, v3.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v17.4s\n" + "sqadd v9.4s, v9.4s, v16.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v18.4s\n" "19:" // Height 1: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v18.4s }, [x20]\n" "srshl v15.4s, v15.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v17.4s }, [x20]\n" + "add v15.4s, v15.4s, v18.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "add v10.4s, v10.4s, v18.4s\n" + "add v11.4s, v11.4s, v18.4s\n" "cmp x10, #0x10\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" + "smin v15.4s, v15.4s, v17.4s\n" + "smin v9.4s, v9.4s, v17.4s\n" + "smin v10.4s, v10.4s, v17.4s\n" + "smin v11.4s, v11.4s, v17.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" "uzp1 v15.8h, 
v15.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" - "uzp1 v15.16b, v15.16b, v9.16b\n" + "uzp1 v16.8h, v10.8h, v11.8h\n" + "uzp1 v15.16b, v15.16b, v16.16b\n" "bge 28f\n" "tbz x10, #3, 23f\n" "str d15, [x11], #0x8\n" @@ -442,12 +441,12 @@ void a64_hybrid_s8qs_mmla_6x16 ( "33:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 34f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -455,7 +454,7 @@ void a64_hybrid_s8qs_mmla_6x16 ( "b 35f\n" "34:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "35:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 38f\n" @@ -466,85 +465,85 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q6, [x9, #0x10]\n" "blt 37f\n" "36:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" + "trn1 v18.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" + 
"ldr q17, [x9, #0xe0]\n" + ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" "add x9, x9, #0x100\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x9, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "bge 36b\n" "37:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" + "trn1 v18.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a648 // smmla v8.4s, v18.16b, v7.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e86a64c // smmla v12.4s, v18.16b, v6.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x80]\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x90]\n" + ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xa0]\n" + ".inst 0x4e90a42c // smmla v12.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xb0]\n" + ".inst 0x4e91a429 // smmla v9.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xc0]\n" + ".inst 0x4e90a42d // smmla v13.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xd0]\n" + ".inst 0x4e91a42a // smmla v10.4s, v1.16b, v17.16b\n" + "ldr q17, [x9, #0xe0]\n" + ".inst 0x4e90a42e // smmla v14.4s, v1.16b, v16.16b\n" + "ldr q16, [x9, #0xf0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e91a42b // smmla v11.4s, v1.16b, v17.16b\n" + ".inst 0x4e90a42f // smmla v15.4s, v1.16b, v16.16b\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" @@ -554,27 +553,27 @@ void a64_hybrid_s8qs_mmla_6x16 ( "cmp x27, #0x8\n" "blt 40f\n" "39:" // Height 2: Multiply loop: Odd block loop - 
"ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d17, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "trn1 v18.2d, v17.2d, v16.2d\n" "sub x27, x27, #0x8\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - "ldr q6, [x9, #0x20]\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - "ldr q6, [x9, #0x40]\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - "ldr q6, [x9, #0x60]\n" - "ldr q7, [x9, #0x70]\n" + "ldr q17, [x9, #0x0]\n" + "ldr q16, [x9, #0x10]\n" + ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n" + "ldr q17, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q17, [x9, #0x40]\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q17, [x9, #0x60]\n" + "ldr q16, [x9, #0x70]\n" "cmp x27, #0x8\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" "add x9, x9, #0x80\n" "bge 39b\n" "40:" // Height 2: Multiply loop: Skip odd blocks @@ -606,55 +605,55 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "44:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x9, #0x0]\n" - "ldr q6, [x9, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q17, [x9, #0x0]\n" + "ldr q16, [x9, #0x10]\n" + "trn1 v18.2d, v1.2d, v2.2d\n" + ".inst 0x4e91a648 // smmla v8.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x20]\n" + ".inst 0x4e90a64c // smmla v12.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x30]\n" + ".inst 0x4e91a649 // smmla v9.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x40]\n" + ".inst 0x4e90a64d // smmla v13.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x50]\n" + ".inst 0x4e91a64a // smmla v10.4s, v18.16b, v17.16b\n" + "ldr q17, [x9, #0x60]\n" + ".inst 0x4e90a64e // smmla v14.4s, v18.16b, v16.16b\n" + "ldr q16, [x9, #0x70]\n" + ".inst 0x4e91a64b // smmla v11.4s, v18.16b, v17.16b\n" + ".inst 0x4e90a64f // smmla v15.4s, v18.16b, v16.16b\n" "add x9, x9, #0x80\n" "45:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 33b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr q19, [x14, #0x0]\n" + "ldr q18, [x14, #0x10]\n" + "uzp1 v17.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q2, [x14, 
#0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q5, [x14, #0x20]\n" + "ldr q16, [x14, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp1 v13.2d, v10.2d, v14.2d\n" "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x24, x11, x20\n" + "add x25, x11, x20\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x24, #0x0]\n" - "mov v15.16b, v7.16b\n" - "add v15.4s, v15.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "mov v15.16b, v17.16b\n" + "add v15.4s, v15.4s, v19.4s\n" "add x14, x14, #0x40\n" - "add v12.4s, v12.4s, v1.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v3.4s\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" + "add v12.4s, v12.4s, v18.4s\n" + "add v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v16.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v16.4s\n" "tbz %x[flags], #4, 46f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -668,10 +667,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 47f\n" "46:" // Height 2: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -688,141 +687,141 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v10.4s, v10.4s, v6.4s\n" "sqrdmulh v11.4s, v11.4s, v7.4s\n" "tbz %x[flags], #5, 48f\n" - "and v4.16b, v15.16b, v0.16b\n" - "and v5.16b, v12.16b, v1.16b\n" - "and v6.16b, v13.16b, v2.16b\n" - "and v7.16b, v14.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "sqadd v12.4s, v12.4s, v5.4s\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "sqadd v14.4s, v14.4s, v7.4s\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" + "and v19.16b, v15.16b, v0.16b\n" + "and v18.16b, v12.16b, v1.16b\n" + "and v17.16b, v13.16b, v2.16b\n" + "and v16.16b, v14.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v19.4s\n" + "sqadd v12.4s, v12.4s, v18.4s\n" + "sqadd v13.4s, v13.4s, v17.4s\n" + "sqadd v14.4s, v14.4s, v16.4s\n" + "and v19.16b, v8.16b, v0.16b\n" + "and v18.16b, v9.16b, v1.16b\n" + "and v17.16b, v10.16b, v2.16b\n" + "and v16.16b, v11.16b, v3.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v19.4s\n" + "sqadd v9.4s, v9.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v17.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" "48:" // Height 2: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v18.4s }, 
[x20]\n" "srshl v15.4s, v15.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v17.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "cmp x10, #0x10\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" + "add v15.4s, v15.4s, v18.4s\n" + "add v12.4s, v12.4s, v18.4s\n" + "add v13.4s, v13.4s, v18.4s\n" + "add v14.4s, v14.4s, v18.4s\n" + "add v8.4s, v8.4s, v18.4s\n" + "add v9.4s, v9.4s, v18.4s\n" + "add v10.4s, v10.4s, v18.4s\n" + "add v11.4s, v11.4s, v18.4s\n" + "smin v15.4s, v15.4s, v17.4s\n" + "smin v12.4s, v12.4s, v17.4s\n" + "smin v13.4s, v13.4s, v17.4s\n" + "smin v14.4s, v14.4s, v17.4s\n" + "smin v8.4s, v8.4s, v17.4s\n" + "smin v9.4s, v9.4s, v17.4s\n" + "smin v10.4s, v10.4s, v17.4s\n" + "smin v11.4s, v11.4s, v17.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smax v12.4s, v12.4s, v16.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v14.4s, v14.4s, v16.4s\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" "uzp1 v15.8h, v15.8h, v12.8h\n" - "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v17.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" - "uzp1 v15.16b, v15.16b, v12.16b\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v16.8h, v10.8h, v11.8h\n" + "uzp1 v15.16b, v15.16b, v17.16b\n" + "uzp1 v8.16b, v8.16b, v16.16b\n" "bge 57f\n" "tbz x10, #3, 52f\n" "str d15, [x11], #0x8\n" - "str d8, [x24], #0x8\n" + "str d8, [x25], #0x8\n" "tbz x10, #2, 50f\n" "st1 { v15.s }[2], [x11], #0x4\n" - "st1 { v8.s }[2], [x24], #0x4\n" + "st1 { v8.s }[2], [x25], #0x4\n" "tbz x10, #1, 49f\n" "st1 { v15.h }[6], [x11], #0x2\n" - "st1 { v8.h }[6], [x24], #0x2\n" + "st1 { v8.h }[6], [x25], #0x2\n" "tbz x10, #0, 56f\n" "st1 { v15.b }[14], [x11]\n" - "st1 { v8.b }[14], [x24]\n" + "st1 { v8.b }[14], [x25]\n" "b 56f\n" "49:" // Height 2: Partial direct writeback: partial_1_12 "tbz x10, #0, 56f\n" "st1 { v15.b }[12], [x11]\n" - "st1 { v8.b }[12], [x24]\n" + "st1 { v8.b }[12], [x25]\n" "b 56f\n" "50:" // Height 2: Partial direct writeback: partial_2_8 "tbz x10, #1, 51f\n" "st1 { v15.h }[4], [x11], #0x2\n" - "st1 { v8.h }[4], [x24], #0x2\n" + "st1 { v8.h }[4], [x25], #0x2\n" "tbz x10, #0, 56f\n" "st1 { v15.b }[10], [x11]\n" - "st1 { v8.b }[10], [x24]\n" + "st1 { v8.b }[10], [x25]\n" "b 56f\n" "51:" // Height 2: Partial direct writeback: partial_1_8 "tbz x10, #0, 56f\n" "st1 { v15.b }[8], [x11]\n" - "st1 { v8.b }[8], [x24]\n" + "st1 { v8.b }[8], 
[x25]\n" "b 56f\n" "52:" // Height 2: Partial direct writeback: partial_4_0 "tbz x10, #2, 54f\n" "str s15, [x11], #0x4\n" - "str s8, [x24], #0x4\n" + "str s8, [x25], #0x4\n" "tbz x10, #1, 53f\n" "st1 { v15.h }[2], [x11], #0x2\n" - "st1 { v8.h }[2], [x24], #0x2\n" + "st1 { v8.h }[2], [x25], #0x2\n" "tbz x10, #0, 56f\n" "st1 { v15.b }[6], [x11]\n" - "st1 { v8.b }[6], [x24]\n" + "st1 { v8.b }[6], [x25]\n" "b 56f\n" "53:" // Height 2: Partial direct writeback: partial_1_4 "tbz x10, #0, 56f\n" "st1 { v15.b }[4], [x11]\n" - "st1 { v8.b }[4], [x24]\n" + "st1 { v8.b }[4], [x25]\n" "b 56f\n" "54:" // Height 2: Partial direct writeback: partial_2_0 "tbz x10, #1, 55f\n" "str h15, [x11], #0x2\n" - "str h8, [x24], #0x2\n" + "str h8, [x25], #0x2\n" "tbz x10, #0, 56f\n" "st1 { v15.b }[2], [x11]\n" - "st1 { v8.b }[2], [x24]\n" + "st1 { v8.b }[2], [x25]\n" "b 56f\n" "55:" // Height 2: Partial direct writeback: partial_1_0 "str b15, [x11, #0x0]\n" - "str b8, [x24, #0x0]\n" + "str b8, [x25, #0x0]\n" "56:" // Height 2: Partial direct writeback: Done "b 58f\n" "57:" // Height 2: Full writeback "str q15, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q8, [x24, #0x0]\n" + "str q8, [x25, #0x0]\n" "58:" // Height 2: Writeback done "subs x10, x10, #0x10\n" "bgt 31b\n" @@ -856,13 +855,13 @@ void a64_hybrid_s8qs_mmla_6x16 ( "62:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 63f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -871,8 +870,8 @@ void a64_hybrid_s8qs_mmla_6x16 ( "b 64f\n" "63:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "64:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 67f\n" @@ -884,167 +883,167 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q6, [x9, #0x10]\n" "blt 66f\n" "65:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" + "trn1 v26.2d, v3.2d, v28.2d\n" + ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" + "trn2 v3.2d, v3.2d, v28.2d\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla 
v13.4s, v27.16b, v24.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" "cmp x27, #0x20\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e98a42f // smmla v15.4s, v1.16b, 
v24.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x9, #0x10]\n" "bge 65b\n" "66:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" + "trn1 v26.2d, v3.2d, v25.2d\n" + ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" + "ldr q24, [x9, #0x20]\n" + ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" + "ldr q0, [x9, #0x30]\n" + ".inst 0x4e98a769 // smmla v9.4s, v27.16b, v24.16b\n" + "trn2 v3.2d, v3.2d, v25.2d\n" + ".inst 0x4e98a751 // smmla v17.4s, v26.16b, v24.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e80a76d // smmla v13.4s, v27.16b, v0.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a755 // smmla v21.4s, v26.16b, v0.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x90]\n" + ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 
0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" + ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" "67:" // Height 3: Multiply loop: Main loop skip "cbz x27, 74f\n" "cmp x27, #0x8\n" "blt 69f\n" "68:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x9, #0x0]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr d25, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "trn1 v27.2d, v25.2d, v24.2d\n" + "ldr d24, [x24], #0x8\n" + "ldr q25, [x9, #0x0]\n" + "trn1 v26.2d, v24.2d, v26.2d\n" + ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" + "ldr q24, [x9, #0x10]\n" + ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" "sub x27, x27, #0x8\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" "cmp x27, #0x8\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, 
v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" "add x9, x9, #0x80\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" "bge 68b\n" "69:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 74f\n" @@ -1082,74 +1081,74 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "73:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x9, #0x0]\n" - "ldr q6, [x9, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" + "ldr q25, [x9, #0x0]\n" + "ldr q28, [x9, #0x10]\n" + "trn1 v27.2d, v1.2d, v2.2d\n" + "trn1 v26.2d, v3.2d, v24.2d\n" + ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e9ca76c // smmla v12.4s, v27.16b, v28.16b\n" + ".inst 0x4e9ca754 // smmla v20.4s, v26.16b, v28.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" "add x9, x9, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" "74:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 62b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr q28, [x14, #0x0]\n" + "ldr q27, [x14, #0x10]\n" + "uzp1 v26.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, 
v8.2d, v12.2d\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q25, [x14, #0x20]\n" + "ldr q24, [x14, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp1 v13.2d, v10.2d, v14.2d\n" "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x24, x11, x20\n" + "add x25, x11, x20\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" - "add x23, x24, x20\n" + "add x24, x25, x20\n" "prfm pstl1keep, [x11, #0x0]\n" "uzp1 v16.2d, v16.2d, v20.2d\n" "uzp1 v17.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" "uzp1 v18.2d, v18.2d, v22.2d\n" "uzp1 v19.2d, v19.2d, v23.2d\n" "add x14, x14, #0x40\n" - "mov v23.16b, v7.16b\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v12.4s, v12.4s, v1.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v3.4s\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "mov v23.16b, v26.16b\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v12.4s, v12.4s, v27.4s\n" + "add v13.4s, v13.4s, v25.4s\n" + "add v14.4s, v14.4s, v24.4s\n" + "add v8.4s, v8.4s, v28.4s\n" + "add v9.4s, v9.4s, v27.4s\n" + "add v10.4s, v10.4s, v25.4s\n" + "add v11.4s, v11.4s, v24.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v27.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" "tbz %x[flags], #4, 75f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -1163,10 +1162,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 76f\n" "75:" // Height 3: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1187,55 +1186,55 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v18.4s, v18.4s, v6.4s\n" "sqrdmulh v19.4s, v19.4s, v7.4s\n" "tbz %x[flags], #5, 77f\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v12.16b, v1.16b\n" - "and v6.16b, v13.16b, v2.16b\n" - "and v7.16b, v14.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v12.4s, v12.4s, v5.4s\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "sqadd v14.4s, v14.4s, v7.4s\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v24.16b, v23.16b, v0.16b\n" + "and v22.16b, v12.16b, v1.16b\n" + "and 
v21.16b, v13.16b, v2.16b\n" + "and v20.16b, v14.16b, v3.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v24.4s\n" + "sqadd v12.4s, v12.4s, v22.4s\n" + "sqadd v13.4s, v13.4s, v21.4s\n" + "sqadd v14.4s, v14.4s, v20.4s\n" + "and v24.16b, v8.16b, v0.16b\n" + "and v22.16b, v9.16b, v1.16b\n" + "and v21.16b, v10.16b, v2.16b\n" + "and v20.16b, v11.16b, v3.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v24.4s\n" + "sqadd v9.4s, v9.4s, v22.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sqadd v11.4s, v11.4s, v20.4s\n" + "and v24.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v1.16b\n" + "and v21.16b, v18.16b, v2.16b\n" + "and v20.16b, v19.16b, v3.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v24.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "77:" // Height 3: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v22.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v21.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -1243,132 +1242,132 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add v23.4s, v23.4s, v22.4s\n" + "add v12.4s, v12.4s, v22.4s\n" + "add v13.4s, v13.4s, v22.4s\n" + "add v14.4s, v14.4s, v22.4s\n" + "add v8.4s, v8.4s, v22.4s\n" + "add v9.4s, v9.4s, v22.4s\n" + "add v10.4s, v10.4s, v22.4s\n" + "add v11.4s, v11.4s, v22.4s\n" + "add v16.4s, v16.4s, v22.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v22.4s\n" + "add v19.4s, v19.4s, v22.4s\n" + "smin 
v23.4s, v23.4s, v21.4s\n" + "smin v12.4s, v12.4s, v21.4s\n" + "smin v13.4s, v13.4s, v21.4s\n" + "smin v14.4s, v14.4s, v21.4s\n" + "smin v8.4s, v8.4s, v21.4s\n" + "smin v9.4s, v9.4s, v21.4s\n" + "smin v10.4s, v10.4s, v21.4s\n" + "smin v11.4s, v11.4s, v21.4s\n" + "smin v16.4s, v16.4s, v21.4s\n" + "smin v17.4s, v17.4s, v21.4s\n" + "smin v18.4s, v18.4s, v21.4s\n" + "smin v19.4s, v19.4s, v21.4s\n" + "smax v23.4s, v23.4s, v20.4s\n" + "smax v12.4s, v12.4s, v20.4s\n" + "smax v13.4s, v13.4s, v20.4s\n" + "smax v14.4s, v14.4s, v20.4s\n" + "smax v8.4s, v8.4s, v20.4s\n" + "smax v9.4s, v9.4s, v20.4s\n" + "smax v10.4s, v10.4s, v20.4s\n" + "smax v11.4s, v11.4s, v20.4s\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v23.8h, v23.8h, v12.8h\n" - "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v21.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v20.8h, v10.8h, v11.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v12.16b\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v23.16b, v23.16b, v21.16b\n" + "uzp1 v8.16b, v8.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 86f\n" "tbz x10, #3, 81f\n" "str d23, [x11], #0x8\n" - "str d8, [x24], #0x8\n" - "str d16, [x23], #0x8\n" + "str d8, [x25], #0x8\n" + "str d16, [x24], #0x8\n" "tbz x10, #2, 79f\n" "st1 { v23.s }[2], [x11], #0x4\n" - "st1 { v8.s }[2], [x24], #0x4\n" - "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v8.s }[2], [x25], #0x4\n" + "st1 { v16.s }[2], [x24], #0x4\n" "tbz x10, #1, 78f\n" "st1 { v23.h }[6], [x11], #0x2\n" - "st1 { v8.h }[6], [x24], #0x2\n" - "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v8.h }[6], [x25], #0x2\n" + "st1 { v16.h }[6], [x24], #0x2\n" "tbz x10, #0, 85f\n" "st1 { v23.b }[14], [x11]\n" - "st1 { v8.b }[14], [x24]\n" - "st1 { v16.b }[14], [x23]\n" + "st1 { v8.b }[14], [x25]\n" + "st1 { v16.b }[14], [x24]\n" "b 85f\n" "78:" // Height 3: Partial direct writeback: partial_1_12 "tbz x10, #0, 85f\n" "st1 { v23.b }[12], [x11]\n" - "st1 { v8.b }[12], [x24]\n" - "st1 { v16.b }[12], [x23]\n" + "st1 { v8.b }[12], [x25]\n" + "st1 { v16.b }[12], [x24]\n" "b 85f\n" "79:" // Height 3: Partial direct writeback: partial_2_8 "tbz x10, #1, 80f\n" "st1 { v23.h }[4], [x11], #0x2\n" - "st1 { v8.h }[4], [x24], #0x2\n" - "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v8.h }[4], [x25], #0x2\n" + "st1 { v16.h }[4], [x24], #0x2\n" "tbz x10, #0, 85f\n" "st1 { v23.b }[10], [x11]\n" - "st1 { v8.b }[10], [x24]\n" - "st1 { v16.b }[10], [x23]\n" + "st1 { v8.b }[10], [x25]\n" + "st1 { v16.b }[10], [x24]\n" "b 85f\n" "80:" // Height 3: Partial direct writeback: partial_1_8 "tbz x10, #0, 85f\n" "st1 { v23.b }[8], [x11]\n" - "st1 { v8.b }[8], [x24]\n" - "st1 { v16.b }[8], [x23]\n" + "st1 { v8.b }[8], [x25]\n" + "st1 { v16.b }[8], [x24]\n" "b 85f\n" "81:" // Height 3: Partial direct writeback: partial_4_0 "tbz x10, #2, 83f\n" "str s23, [x11], #0x4\n" - "str s8, [x24], #0x4\n" - "str s16, [x23], #0x4\n" + "str s8, [x25], #0x4\n" + "str s16, [x24], #0x4\n" "tbz x10, #1, 82f\n" "st1 { v23.h }[2], [x11], #0x2\n" - "st1 { v8.h }[2], [x24], #0x2\n" - "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v8.h }[2], [x25], #0x2\n" + "st1 { v16.h }[2], [x24], #0x2\n" "tbz x10, #0, 85f\n" "st1 { v23.b }[6], [x11]\n" - "st1 { v8.b }[6], [x24]\n" - "st1 { v16.b }[6], [x23]\n" + "st1 { v8.b }[6], [x25]\n" + "st1 { v16.b }[6], [x24]\n" "b 85f\n" "82:" // Height 3: Partial direct writeback: partial_1_4 "tbz x10, #0, 85f\n" "st1 
{ v23.b }[4], [x11]\n" - "st1 { v8.b }[4], [x24]\n" - "st1 { v16.b }[4], [x23]\n" + "st1 { v8.b }[4], [x25]\n" + "st1 { v16.b }[4], [x24]\n" "b 85f\n" "83:" // Height 3: Partial direct writeback: partial_2_0 "tbz x10, #1, 84f\n" "str h23, [x11], #0x2\n" - "str h8, [x24], #0x2\n" - "str h16, [x23], #0x2\n" + "str h8, [x25], #0x2\n" + "str h16, [x24], #0x2\n" "tbz x10, #0, 85f\n" "st1 { v23.b }[2], [x11]\n" - "st1 { v8.b }[2], [x24]\n" - "st1 { v16.b }[2], [x23]\n" + "st1 { v8.b }[2], [x25]\n" + "st1 { v16.b }[2], [x24]\n" "b 85f\n" "84:" // Height 3: Partial direct writeback: partial_1_0 "str b23, [x11, #0x0]\n" - "str b8, [x24, #0x0]\n" - "str b16, [x23, #0x0]\n" + "str b8, [x25, #0x0]\n" + "str b16, [x24, #0x0]\n" "85:" // Height 3: Partial direct writeback: Done "b 87f\n" "86:" // Height 3: Full writeback "str q23, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q8, [x24, #0x0]\n" - "str q16, [x23, #0x0]\n" + "str q8, [x25, #0x0]\n" + "str q16, [x24, #0x0]\n" "87:" // Height 3: Writeback done "subs x10, x10, #0x10\n" "bgt 60b\n" @@ -1402,14 +1401,14 @@ void a64_hybrid_s8qs_mmla_6x16 ( "91:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 92f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 93f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1419,9 +1418,9 @@ void a64_hybrid_s8qs_mmla_6x16 ( "b 93f\n" "92:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "93:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 96f\n" @@ -1434,173 +1433,173 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q6, [x9, #0x10]\n" "blt 95f\n" "94:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" "sub x27, x27, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "trn1 v26.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, 
[x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" "add x23, x23, #0x10\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" "cmp x27, #0x20\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // 
smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x9, #0x10]\n" "bge 94b\n" "95:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a768 // smmla v8.4s, v27.16b, v7.16b\n" "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "trn1 v26.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a750 // smmla v16.4s, v26.16b, v7.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e86a76c // smmla v12.4s, v27.16b, v6.16b\n" + ".inst 0x4e86a754 // smmla v20.4s, v26.16b, v6.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" "add x23, x23, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x80]\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x90]\n" + ".inst 0x4e99a428 // smmla v8.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a470 // smmla v16.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xa0]\n" + ".inst 0x4e98a42c // smmla v12.4s, v1.16b, v24.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e98a474 // smmla v20.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xb0]\n" + ".inst 0x4e99a429 // smmla v9.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, 
v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + ".inst 0x4e99a471 // smmla v17.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xc0]\n" + ".inst 0x4e98a42d // smmla v13.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a475 // smmla v21.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xd0]\n" + ".inst 0x4e99a42a // smmla v10.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a472 // smmla v18.4s, v3.16b, v25.16b\n" + "ldr q25, [x9, #0xe0]\n" + ".inst 0x4e98a42e // smmla v14.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a476 // smmla v22.4s, v3.16b, v24.16b\n" + "ldr q24, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e99a42b // smmla v11.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a473 // smmla v19.4s, v3.16b, v25.16b\n" + ".inst 0x4e98a42f // smmla v15.4s, v1.16b, v24.16b\n" + ".inst 0x4e98a477 // smmla v23.4s, v3.16b, v24.16b\n" "96:" // Height 4: Multiply loop: Main loop skip "cbz x27, 103f\n" "cmp x27, #0x8\n" "blt 98f\n" "97:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d25, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "trn1 v27.2d, v25.2d, v24.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "trn1 v26.2d, v25.2d, v24.2d\n" "cmp x27, #0x8\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x70]\n" + "ldr q25, [x9, #0x0]\n" + "ldr q24, [x9, #0x10]\n" + ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, 
#0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" "add x9, x9, #0x80\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" "bge 97b\n" "98:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 103f\n" @@ -1645,84 +1644,84 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" "102:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x9, #0x0]\n" - "ldr q6, [x9, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" + "ldr q25, [x9, #0x0]\n" + "ldr q24, [x9, #0x10]\n" + "trn1 v27.2d, v1.2d, v2.2d\n" + "trn1 v26.2d, v3.2d, v4.2d\n" + ".inst 0x4e99a768 // smmla v8.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a750 // smmla v16.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x20]\n" + ".inst 0x4e98a76c // smmla v12.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a754 // smmla v20.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x30]\n" + ".inst 0x4e99a769 // smmla v9.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a751 // smmla v17.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x40]\n" + ".inst 0x4e98a76d // smmla v13.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a755 // smmla v21.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x50]\n" + ".inst 0x4e99a76a // smmla v10.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a752 // smmla v18.4s, v26.16b, v25.16b\n" + "ldr q25, [x9, #0x60]\n" + ".inst 0x4e98a76e // smmla v14.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a756 // smmla v22.4s, v26.16b, v24.16b\n" + "ldr q24, [x9, #0x70]\n" "add x9, x9, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e99a76b // smmla v11.4s, v27.16b, v25.16b\n" + ".inst 0x4e99a753 // smmla v19.4s, v26.16b, v25.16b\n" + ".inst 0x4e98a76f // smmla v15.4s, v27.16b, v24.16b\n" + ".inst 0x4e98a757 // smmla v23.4s, v26.16b, v24.16b\n" "103:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 91b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr q28, [x14, #0x0]\n" + "ldr q27, [x14, #0x10]\n" + "uzp1 v26.2d, v8.2d, 
v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q25, [x14, #0x20]\n" + "ldr q24, [x14, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp1 v13.2d, v10.2d, v14.2d\n" "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x24, x11, x20\n" + "add x25, x11, x20\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" - "add x22, x23, x20\n" "uzp1 v15.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "prfm pstl1keep, [x11, #0x0]\n" - "prfm pstl1keep, [x24, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "add x14, x14, #0x40\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "mov v23.16b, v7.16b\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v12.4s, v12.4s, v1.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v3.4s\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" - "add v15.4s, v15.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "mov v23.16b, v26.16b\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v12.4s, v12.4s, v27.4s\n" + "add v13.4s, v13.4s, v25.4s\n" + "add v14.4s, v14.4s, v24.4s\n" + "add v8.4s, v8.4s, v28.4s\n" + "add v9.4s, v9.4s, v27.4s\n" + "add v10.4s, v10.4s, v25.4s\n" + "add v11.4s, v11.4s, v24.4s\n" + "add v15.4s, v15.4s, v28.4s\n" + "add v20.4s, v20.4s, v27.4s\n" + "add v21.4s, v21.4s, v25.4s\n" + "add v22.4s, v22.4s, v24.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v27.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" "tbz %x[flags], #4, 104f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -1736,10 +1735,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 105f\n" "104:" // Height 4: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -1763,68 +1762,68 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v17.4s, v17.4s, v5.4s\n" "sqrdmulh v18.4s, v18.4s, v6.4s\n" "sqrdmulh v19.4s, v19.4s, v7.4s\n" - "tbz %x[flags], #5, 106f\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v12.16b, v1.16b\n" - "and v6.16b, v13.16b, v2.16b\n" - "and v7.16b, v14.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v12.4s, v12.4s, v5.4s\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "sqadd v14.4s, v14.4s, v7.4s\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - 
"sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v15.16b, v0.16b\n" - "and v5.16b, v20.16b, v1.16b\n" - "and v6.16b, v21.16b, v2.16b\n" - "and v7.16b, v22.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 106f\n" + "and v27.16b, v23.16b, v0.16b\n" + "and v26.16b, v12.16b, v1.16b\n" + "and v25.16b, v13.16b, v2.16b\n" + "and v24.16b, v14.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v27.4s\n" + "sqadd v12.4s, v12.4s, v26.4s\n" + "sqadd v13.4s, v13.4s, v25.4s\n" + "sqadd v14.4s, v14.4s, v24.4s\n" + "and v27.16b, v8.16b, v0.16b\n" + "and v26.16b, v9.16b, v1.16b\n" + "and v25.16b, v10.16b, v2.16b\n" + "and v24.16b, v11.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "sqadd v9.4s, v9.4s, v26.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v24.4s\n" + "and v27.16b, v15.16b, v0.16b\n" + "and v26.16b, v20.16b, v1.16b\n" + "and v25.16b, v21.16b, v2.16b\n" + "and v24.16b, v22.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v27.4s\n" + "sqadd v20.4s, v20.4s, v26.4s\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v27.16b, v16.16b, v0.16b\n" + "and v26.16b, v17.16b, v1.16b\n" + "and v25.16b, v18.16b, v2.16b\n" + "and v24.16b, v19.16b, v3.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v27.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "sqadd v18.4s, v18.4s, v25.4s\n" + "sqadd v19.4s, v19.4s, v24.4s\n" "106:" // Height 4: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -1836,163 +1835,163 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v17.4s, v17.4s, v1.4s\n" "srshl v18.4s, v18.4s, v2.4s\n" "srshl v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, 
v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "add v12.4s, v12.4s, v26.4s\n" + "add v13.4s, v13.4s, v26.4s\n" + "add v14.4s, v14.4s, v26.4s\n" + "add v8.4s, v8.4s, v26.4s\n" + "add v9.4s, v9.4s, v26.4s\n" + "add v10.4s, v10.4s, v26.4s\n" + "add v11.4s, v11.4s, v26.4s\n" + "add v15.4s, v15.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smin v12.4s, v12.4s, v25.4s\n" + "smin v13.4s, v13.4s, v25.4s\n" + "smin v14.4s, v14.4s, v25.4s\n" + "smin v8.4s, v8.4s, v25.4s\n" + "smin v9.4s, v9.4s, v25.4s\n" + "smin v10.4s, v10.4s, v25.4s\n" + "smin v11.4s, v11.4s, v25.4s\n" + "smin v15.4s, v15.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "smax v12.4s, v12.4s, v24.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smax v14.4s, v14.4s, v24.4s\n" + "smax v8.4s, v8.4s, v24.4s\n" + "smax v9.4s, v9.4s, v24.4s\n" + "smax v10.4s, v10.4s, v24.4s\n" + "smax v11.4s, v11.4s, v24.4s\n" + "smax v15.4s, v15.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" "uzp1 v23.8h, v23.8h, v12.8h\n" - "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v25.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v24.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v12.16b\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v23.16b, v23.16b, v25.16b\n" + "uzp1 v8.16b, v8.16b, v24.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 115f\n" "tbz x10, #3, 110f\n" 
"str d23, [x11], #0x8\n" - "str d8, [x24], #0x8\n" - "str d15, [x23], #0x8\n" - "str d16, [x22], #0x8\n" + "str d8, [x25], #0x8\n" + "str d15, [x24], #0x8\n" + "str d16, [x23], #0x8\n" "tbz x10, #2, 108f\n" "st1 { v23.s }[2], [x11], #0x4\n" - "st1 { v8.s }[2], [x24], #0x4\n" - "st1 { v15.s }[2], [x23], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v8.s }[2], [x25], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" "tbz x10, #1, 107f\n" "st1 { v23.h }[6], [x11], #0x2\n" - "st1 { v8.h }[6], [x24], #0x2\n" - "st1 { v15.h }[6], [x23], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v8.h }[6], [x25], #0x2\n" + "st1 { v15.h }[6], [x24], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" "tbz x10, #0, 114f\n" "st1 { v23.b }[14], [x11]\n" - "st1 { v8.b }[14], [x24]\n" - "st1 { v15.b }[14], [x23]\n" - "st1 { v16.b }[14], [x22]\n" + "st1 { v8.b }[14], [x25]\n" + "st1 { v15.b }[14], [x24]\n" + "st1 { v16.b }[14], [x23]\n" "b 114f\n" "107:" // Height 4: Partial direct writeback: partial_1_12 "tbz x10, #0, 114f\n" "st1 { v23.b }[12], [x11]\n" - "st1 { v8.b }[12], [x24]\n" - "st1 { v15.b }[12], [x23]\n" - "st1 { v16.b }[12], [x22]\n" + "st1 { v8.b }[12], [x25]\n" + "st1 { v15.b }[12], [x24]\n" + "st1 { v16.b }[12], [x23]\n" "b 114f\n" "108:" // Height 4: Partial direct writeback: partial_2_8 "tbz x10, #1, 109f\n" "st1 { v23.h }[4], [x11], #0x2\n" - "st1 { v8.h }[4], [x24], #0x2\n" - "st1 { v15.h }[4], [x23], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v8.h }[4], [x25], #0x2\n" + "st1 { v15.h }[4], [x24], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" "tbz x10, #0, 114f\n" "st1 { v23.b }[10], [x11]\n" - "st1 { v8.b }[10], [x24]\n" - "st1 { v15.b }[10], [x23]\n" - "st1 { v16.b }[10], [x22]\n" + "st1 { v8.b }[10], [x25]\n" + "st1 { v15.b }[10], [x24]\n" + "st1 { v16.b }[10], [x23]\n" "b 114f\n" "109:" // Height 4: Partial direct writeback: partial_1_8 "tbz x10, #0, 114f\n" "st1 { v23.b }[8], [x11]\n" - "st1 { v8.b }[8], [x24]\n" - "st1 { v15.b }[8], [x23]\n" - "st1 { v16.b }[8], [x22]\n" + "st1 { v8.b }[8], [x25]\n" + "st1 { v15.b }[8], [x24]\n" + "st1 { v16.b }[8], [x23]\n" "b 114f\n" "110:" // Height 4: Partial direct writeback: partial_4_0 "tbz x10, #2, 112f\n" "str s23, [x11], #0x4\n" - "str s8, [x24], #0x4\n" - "str s15, [x23], #0x4\n" - "str s16, [x22], #0x4\n" + "str s8, [x25], #0x4\n" + "str s15, [x24], #0x4\n" + "str s16, [x23], #0x4\n" "tbz x10, #1, 111f\n" "st1 { v23.h }[2], [x11], #0x2\n" - "st1 { v8.h }[2], [x24], #0x2\n" - "st1 { v15.h }[2], [x23], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v8.h }[2], [x25], #0x2\n" + "st1 { v15.h }[2], [x24], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" "tbz x10, #0, 114f\n" "st1 { v23.b }[6], [x11]\n" - "st1 { v8.b }[6], [x24]\n" - "st1 { v15.b }[6], [x23]\n" - "st1 { v16.b }[6], [x22]\n" + "st1 { v8.b }[6], [x25]\n" + "st1 { v15.b }[6], [x24]\n" + "st1 { v16.b }[6], [x23]\n" "b 114f\n" "111:" // Height 4: Partial direct writeback: partial_1_4 "tbz x10, #0, 114f\n" "st1 { v23.b }[4], [x11]\n" - "st1 { v8.b }[4], [x24]\n" - "st1 { v15.b }[4], [x23]\n" - "st1 { v16.b }[4], [x22]\n" + "st1 { v8.b }[4], [x25]\n" + "st1 { v15.b }[4], [x24]\n" + "st1 { v16.b }[4], [x23]\n" "b 114f\n" "112:" // Height 4: Partial direct writeback: partial_2_0 "tbz x10, #1, 113f\n" "str h23, [x11], #0x2\n" - "str h8, [x24], #0x2\n" - "str h15, [x23], #0x2\n" - "str h16, [x22], #0x2\n" + "str h8, [x25], #0x2\n" + "str h15, [x24], #0x2\n" + "str h16, [x23], #0x2\n" "tbz x10, #0, 114f\n" "st1 { v23.b }[2], [x11]\n" - "st1 { v8.b }[2], 
[x24]\n" - "st1 { v15.b }[2], [x23]\n" - "st1 { v16.b }[2], [x22]\n" + "st1 { v8.b }[2], [x25]\n" + "st1 { v15.b }[2], [x24]\n" + "st1 { v16.b }[2], [x23]\n" "b 114f\n" "113:" // Height 4: Partial direct writeback: partial_1_0 "str b23, [x11, #0x0]\n" - "str b8, [x24, #0x0]\n" - "str b15, [x23, #0x0]\n" - "str b16, [x22, #0x0]\n" + "str b8, [x25, #0x0]\n" + "str b15, [x24, #0x0]\n" + "str b16, [x23, #0x0]\n" "114:" // Height 4: Partial direct writeback: Done "b 116f\n" "115:" // Height 4: Full writeback "str q23, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q8, [x24, #0x0]\n" - "str q15, [x23, #0x0]\n" - "str q16, [x22, #0x0]\n" + "str q8, [x25, #0x0]\n" + "str q15, [x24, #0x0]\n" + "str q16, [x23, #0x0]\n" "116:" // Height 4: Writeback done "subs x10, x10, #0x10\n" "bgt 89b\n" @@ -2034,15 +2033,15 @@ void a64_hybrid_s8qs_mmla_6x16 ( "120:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 121f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 122f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2053,10 +2052,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "b 122f\n" "121:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "122:" // Height 5: input setup done "cmp x27, #0x10\n" "blt 125f\n" @@ -2120,42 +2119,42 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" "ldr q2, [x25, #0x0]\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + "ldr q0, [x9, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "ldr q6, [x9, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, 
v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xb0]\n" + ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xc0]\n" + ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xd0]\n" + ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "bge 123b\n" "124:" // Height 5: Multiply loop: Single iteration only @@ -2208,86 +2207,86 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + "ldr q0, [x9, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "ldr q2, [x9, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xb0]\n" + ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n" + "ldr q2, [x9, #0xc0]\n" + ".inst 0x4e80a42d // smmla 
v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xd0]\n" + ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n" + "ldr q2, [x9, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "125:" // Height 5: Multiply loop: Main loop skip "cbz x27, 132f\n" "cmp x27, #0x8\n" "blt 127f\n" "126:" // Height 5: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x9, #0x0]\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr d0, [x22], #0x8\n" + "ldr q1, [x9, #0x0]\n" + "trn1 v2.2d, v0.2d, v2.2d\n" + ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x20]\n" + ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" "cmp x27, #0x8\n" - ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x70]\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x30]\n" + ".inst 0x4e81a489 // smmla v9.4s, 
v4.16b, v1.16b\n" + ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x40]\n" + ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x50]\n" + ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x60]\n" + ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x70]\n" + ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n" "add x9, x9, #0x80\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" + ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" "bge 126b\n" "127:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 132f\n" @@ -2340,74 +2339,74 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr b5, [x22, #0x0]\n" "131:" // Height 5: Multiply loop: Ragged operand read: Done "ldr q7, [x9, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" + "trn1 v6.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v5.2d, v0.2d\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n" + ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" + "ldr q1, [x9, #0x20]\n" + ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x30]\n" + ".inst 0x4e81a4c9 // smmla v9.4s, v6.16b, v1.16b\n" + ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x40]\n" 
+ ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x50]\n" + ".inst 0x4e81a4ca // smmla v10.4s, v6.16b, v1.16b\n" + ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x60]\n" + ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x70]\n" "add x9, x9, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + ".inst 0x4e81a4cb // smmla v11.4s, v6.16b, v1.16b\n" + ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" + ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" "132:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 120b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr q4, [x14, #0x0]\n" + "ldr q3, [x14, #0x10]\n" + "uzp1 v2.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q1, [x14, #0x20]\n" + "ldr q0, [x14, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" + "add x25, x11, x20\n" "uzp1 v13.2d, v10.2d, v14.2d\n" "uzp2 v10.2d, v10.2d, v14.2d\n" "uzp1 v14.2d, v11.2d, v15.2d\n" "uzp2 v11.2d, v11.2d, v15.2d\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" - "add x22, x23, x20\n" "uzp1 v15.2d, v16.2d, v20.2d\n" "uzp2 v16.2d, v16.2d, v20.2d\n" - "add x21, x22, x20\n" + "add x22, x23, x20\n" "prfm pstl1keep, [x11, #0x0]\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" "add x14, x14, #0x40\n" @@ -2415,27 +2414,27 @@ void a64_hybrid_s8qs_mmla_6x16 ( "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v7.16b\n" - "add v31.4s, v31.4s, v0.4s\n" - "add v12.4s, v12.4s, v1.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v3.4s\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" - "add v15.4s, v15.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "mov v31.16b, v2.16b\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v3.4s\n" + 
"add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v0.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v3.4s\n" + "add v10.4s, v10.4s, v1.4s\n" + "add v11.4s, v11.4s, v0.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" "tbz %x[flags], #4, 133f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -2449,10 +2448,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 134f\n" "133:" // Height 5: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -2481,79 +2480,79 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v27.4s, v27.4s, v7.4s\n" "tbz %x[flags], #5, 135f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v12.16b, v1.16b\n" - "and v6.16b, v13.16b, v2.16b\n" - "and v7.16b, v14.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v12.4s, v12.4s, v5.4s\n" - "sqadd v13.4s, v13.4s, v6.4s\n" - "sqadd v14.4s, v14.4s, v7.4s\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v15.16b, v0.16b\n" - "and v5.16b, v20.16b, v1.16b\n" - "and v6.16b, v21.16b, v2.16b\n" - "and v7.16b, v22.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" + "and v30.16b, v31.16b, v0.16b\n" + "and v29.16b, v12.16b, v1.16b\n" + "and v28.16b, v13.16b, v2.16b\n" + "and v23.16b, v14.16b, v3.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v31.4s, 
v31.4s, v30.4s\n" + "sqadd v12.4s, v12.4s, v29.4s\n" + "sqadd v13.4s, v13.4s, v28.4s\n" + "sqadd v14.4s, v14.4s, v23.4s\n" + "and v30.16b, v8.16b, v0.16b\n" + "and v29.16b, v9.16b, v1.16b\n" + "and v28.16b, v10.16b, v2.16b\n" + "and v23.16b, v11.16b, v3.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v30.4s\n" + "sqadd v9.4s, v9.4s, v29.4s\n" + "sqadd v10.4s, v10.4s, v28.4s\n" + "sqadd v11.4s, v11.4s, v23.4s\n" + "and v30.16b, v15.16b, v0.16b\n" + "and v29.16b, v20.16b, v1.16b\n" + "and v28.16b, v21.16b, v2.16b\n" + "and v23.16b, v22.16b, v3.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v21.4s, v21.4s, v28.4s\n" + "sqadd v22.4s, v22.4s, v23.4s\n" + "and v30.16b, v16.16b, v0.16b\n" + "and v29.16b, v17.16b, v1.16b\n" + "and v28.16b, v18.16b, v2.16b\n" + "and v23.16b, v19.16b, v3.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v30.4s\n" + "sqadd v17.4s, v17.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v28.4s\n" + "sqadd v19.4s, v19.4s, v23.4s\n" + "and v30.16b, v24.16b, v0.16b\n" + "and v29.16b, v25.16b, v1.16b\n" + "and v28.16b, v26.16b, v2.16b\n" + "and v23.16b, v27.16b, v3.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v30.4s\n" + "sqadd v25.4s, v25.4s, v29.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "sqadd v27.4s, v27.4s, v23.4s\n" "135:" // Height 5: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v23.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -2569,194 +2568,194 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v20.4s, 
v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v29.4s\n" + "add v12.4s, v12.4s, v29.4s\n" + "add v13.4s, v13.4s, v29.4s\n" + "add v14.4s, v14.4s, v29.4s\n" + "add v8.4s, v8.4s, v29.4s\n" + "add v9.4s, v9.4s, v29.4s\n" + "add v10.4s, v10.4s, v29.4s\n" + "add v11.4s, v11.4s, v29.4s\n" + "add v15.4s, v15.4s, v29.4s\n" + "add v20.4s, v20.4s, v29.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v16.4s, v16.4s, v29.4s\n" + "add v17.4s, v17.4s, v29.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v29.4s\n" + "smin v31.4s, v31.4s, v28.4s\n" + "smin v12.4s, v12.4s, v28.4s\n" + "smin v13.4s, v13.4s, v28.4s\n" + "smin v14.4s, v14.4s, v28.4s\n" + "smin v8.4s, v8.4s, v28.4s\n" + "smin v9.4s, v9.4s, v28.4s\n" + "smin v10.4s, v10.4s, v28.4s\n" + "smin v11.4s, v11.4s, v28.4s\n" + "smin v15.4s, v15.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "smax v31.4s, v31.4s, v23.4s\n" + "smax v12.4s, v12.4s, v23.4s\n" + "smax v13.4s, v13.4s, v23.4s\n" + "smax v14.4s, v14.4s, v23.4s\n" + "smax v8.4s, v8.4s, v23.4s\n" + "smax v9.4s, v9.4s, v23.4s\n" + "smax v10.4s, v10.4s, v23.4s\n" + "smax v11.4s, v11.4s, v23.4s\n" + "smax v15.4s, v15.4s, v23.4s\n" + "smax v20.4s, v20.4s, v23.4s\n" + "smax v21.4s, v21.4s, v23.4s\n" + "smax v22.4s, v22.4s, v23.4s\n" + "smax v16.4s, v16.4s, v23.4s\n" + "smax v17.4s, v17.4s, v23.4s\n" + "smax v18.4s, v18.4s, v23.4s\n" + "smax v19.4s, v19.4s, v23.4s\n" + "smax v24.4s, v24.4s, v23.4s\n" + "smax v25.4s, v25.4s, v23.4s\n" + "smax v26.4s, v26.4s, v23.4s\n" + "smax v27.4s, v27.4s, v23.4s\n" "uzp1 v31.8h, v31.8h, v12.8h\n" - "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v28.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v23.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "uzp1 v31.16b, v31.16b, v12.16b\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 
v31.16b, v31.16b, v28.16b\n" + "uzp1 v8.16b, v8.16b, v23.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 144f\n" "tbz x10, #3, 139f\n" "str d31, [x11], #0x8\n" - "str d8, [x24], #0x8\n" - "str d15, [x23], #0x8\n" - "str d16, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d8, [x25], #0x8\n" + "str d15, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x10, #2, 137f\n" "st1 { v31.s }[2], [x11], #0x4\n" - "st1 { v8.s }[2], [x24], #0x4\n" - "st1 { v15.s }[2], [x23], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v8.s }[2], [x25], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x10, #1, 136f\n" "st1 { v31.h }[6], [x11], #0x2\n" - "st1 { v8.h }[6], [x24], #0x2\n" - "st1 { v15.h }[6], [x23], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v8.h }[6], [x25], #0x2\n" + "st1 { v15.h }[6], [x24], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x10, #0, 143f\n" "st1 { v31.b }[14], [x11]\n" - "st1 { v8.b }[14], [x24]\n" - "st1 { v15.b }[14], [x23]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v8.b }[14], [x25]\n" + "st1 { v15.b }[14], [x24]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 143f\n" "136:" // Height 5: Partial direct writeback: partial_1_12 "tbz x10, #0, 143f\n" "st1 { v31.b }[12], [x11]\n" - "st1 { v8.b }[12], [x24]\n" - "st1 { v15.b }[12], [x23]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v8.b }[12], [x25]\n" + "st1 { v15.b }[12], [x24]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 143f\n" "137:" // Height 5: Partial direct writeback: partial_2_8 "tbz x10, #1, 138f\n" "st1 { v31.h }[4], [x11], #0x2\n" - "st1 { v8.h }[4], [x24], #0x2\n" - "st1 { v15.h }[4], [x23], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v8.h }[4], [x25], #0x2\n" + "st1 { v15.h }[4], [x24], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x10, #0, 143f\n" "st1 { v31.b }[10], [x11]\n" - "st1 { v8.b }[10], [x24]\n" - "st1 { v15.b }[10], [x23]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v8.b }[10], [x25]\n" + "st1 { v15.b }[10], [x24]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 143f\n" "138:" // Height 5: Partial direct writeback: partial_1_8 "tbz x10, #0, 143f\n" "st1 { v31.b }[8], [x11]\n" - "st1 { v8.b }[8], [x24]\n" - "st1 { v15.b }[8], [x23]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v8.b }[8], [x25]\n" + "st1 { v15.b }[8], [x24]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 143f\n" "139:" // Height 5: Partial direct writeback: partial_4_0 "tbz x10, #2, 141f\n" "str s31, [x11], #0x4\n" - "str s8, [x24], #0x4\n" - "str s15, [x23], #0x4\n" - "str s16, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s8, [x25], #0x4\n" + "str s15, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x10, #1, 140f\n" "st1 { v31.h }[2], [x11], #0x2\n" - "st1 { v8.h }[2], [x24], #0x2\n" - "st1 { v15.h }[2], [x23], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v8.h }[2], [x25], #0x2\n" + "st1 { v15.h }[2], [x24], #0x2\n" + "st1 { v16.h }[2], 
[x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x10, #0, 143f\n" "st1 { v31.b }[6], [x11]\n" - "st1 { v8.b }[6], [x24]\n" - "st1 { v15.b }[6], [x23]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v8.b }[6], [x25]\n" + "st1 { v15.b }[6], [x24]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 143f\n" "140:" // Height 5: Partial direct writeback: partial_1_4 "tbz x10, #0, 143f\n" "st1 { v31.b }[4], [x11]\n" - "st1 { v8.b }[4], [x24]\n" - "st1 { v15.b }[4], [x23]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v8.b }[4], [x25]\n" + "st1 { v15.b }[4], [x24]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 143f\n" "141:" // Height 5: Partial direct writeback: partial_2_0 "tbz x10, #1, 142f\n" "str h31, [x11], #0x2\n" - "str h8, [x24], #0x2\n" - "str h15, [x23], #0x2\n" - "str h16, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h8, [x25], #0x2\n" + "str h15, [x24], #0x2\n" + "str h16, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x10, #0, 143f\n" "st1 { v31.b }[2], [x11]\n" - "st1 { v8.b }[2], [x24]\n" - "st1 { v15.b }[2], [x23]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v8.b }[2], [x25]\n" + "st1 { v15.b }[2], [x24]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 143f\n" "142:" // Height 5: Partial direct writeback: partial_1_0 "str b31, [x11, #0x0]\n" - "str b8, [x24, #0x0]\n" - "str b15, [x23, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b8, [x25, #0x0]\n" + "str b15, [x24, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "143:" // Height 5: Partial direct writeback: Done "b 145f\n" "144:" // Height 5: Full writeback "str q31, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q8, [x24, #0x0]\n" - "str q15, [x23, #0x0]\n" - "str q16, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q8, [x25, #0x0]\n" + "str q15, [x24, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "145:" // Height 5: Writeback done "subs x10, x10, #0x10\n" "bgt 118b\n" @@ -2801,16 +2800,16 @@ void a64_hybrid_s8qs_mmla_6x16 ( "149:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 150f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 151f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2822,11 +2821,11 @@ void a64_hybrid_s8qs_mmla_6x16 ( "b 151f\n" "150:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "151:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 154f\n" @@ -2893,42 +2892,42 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr q2, [x25, #0x0]\n" "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4e86a49f // smmla v31.4s, 
v4.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + "ldr q0, [x9, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "ldr q6, [x9, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xb0]\n" + ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xc0]\n" + ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xd0]\n" + ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n" + "ldr q6, [x9, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x9, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "bge 152b\n" @@ -2984,87 +2983,87 @@ void a64_hybrid_s8qs_mmla_6x16 ( ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x90]\n" + "ldr q0, [x9, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 
// smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x9, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x9, #0xf0]\n" + "ldr q2, [x9, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xb0]\n" + ".inst 0x4e82a429 // smmla v9.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a471 // smmla v17.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4b9 // smmla v25.4s, v5.16b, v2.16b\n" + "ldr q2, [x9, #0xc0]\n" + ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xd0]\n" + ".inst 0x4e82a42a // smmla v10.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a472 // smmla v18.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4ba // smmla v26.4s, v5.16b, v2.16b\n" + "ldr q2, [x9, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x9, #0xf0]\n" "add x9, x9, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e82a42b // smmla v11.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a473 // smmla v19.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bb // smmla v27.4s, v5.16b, v2.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "154:" // Height 6: Multiply loop: Main loop skip "cbz x27, 161f\n" "cmp x27, #0x8\n" "blt 156f\n" "155:" // Height 6: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "cmp x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q6, [x9, #0x0]\n" - "ldr q7, [x9, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x30]\n" - 
".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x70]\n" + "ldr d1, [x22], #0x8\n" + "ldr d0, [x21], #0x8\n" + "trn1 v2.2d, v1.2d, v0.2d\n" + "ldr q1, [x9, #0x0]\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x20]\n" + ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x30]\n" + ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x40]\n" + ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x50]\n" + ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" + "ldr q1, [x9, #0x60]\n" + ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x9, #0x70]\n" "add x9, x9, #0x80\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + ".inst 0x4e81a48b // smmla v11.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a473 // smmla v19.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45b // smmla v27.4s, v2.16b, v1.16b\n" + ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" "bge 155b\n" "156:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 161f\n" @@ -3124,77 +3123,77 @@ void a64_hybrid_s8qs_mmla_6x16 ( "ldr b6, [x21, #0x0]\n" "160:" // Height 6: Multiply loop: Ragged operand read: Done "ldr q7, [x9, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x9, #0x10]\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla 
v17.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x9, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x9, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + "trn1 v2.2d, v1.2d, v2.2d\n" + "trn1 v4.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a448 // smmla v8.4s, v2.16b, v7.16b\n" + "trn1 v3.2d, v5.2d, v6.2d\n" + "ldr q0, [x9, #0x10]\n" + ".inst 0x4e87a490 // smmla v16.4s, v4.16b, v7.16b\n" + ".inst 0x4e87a478 // smmla v24.4s, v3.16b, v7.16b\n" + "ldr q1, [x9, #0x20]\n" + ".inst 0x4e80a44c // smmla v12.4s, v2.16b, v0.16b\n" + ".inst 0x4e80a494 // smmla v20.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a47c // smmla v28.4s, v3.16b, v0.16b\n" + "ldr q0, [x9, #0x30]\n" + ".inst 0x4e81a449 // smmla v9.4s, v2.16b, v1.16b\n" + ".inst 0x4e81a491 // smmla v17.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a479 // smmla v25.4s, v3.16b, v1.16b\n" + "ldr q1, [x9, #0x40]\n" + ".inst 0x4e80a44d // smmla v13.4s, v2.16b, v0.16b\n" + ".inst 0x4e80a495 // smmla v21.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a47d // smmla v29.4s, v3.16b, v0.16b\n" + "ldr q0, [x9, #0x50]\n" + ".inst 0x4e81a44a // smmla v10.4s, v2.16b, v1.16b\n" + ".inst 0x4e81a492 // smmla v18.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a47a // smmla v26.4s, v3.16b, v1.16b\n" + "ldr q1, [x9, #0x60]\n" + ".inst 0x4e80a44e // smmla v14.4s, v2.16b, v0.16b\n" + ".inst 0x4e80a496 // smmla v22.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a47e // smmla v30.4s, v3.16b, v0.16b\n" + "ldr q0, [x9, #0x70]\n" + ".inst 0x4e81a44b // smmla v11.4s, v2.16b, v1.16b\n" "add x9, x9, #0x80\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + ".inst 0x4e81a493 // smmla v19.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a47b // smmla v27.4s, v3.16b, v1.16b\n" + ".inst 0x4e80a44f // smmla v15.4s, v2.16b, v0.16b\n" + ".inst 0x4e80a497 // smmla v23.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a47f // smmla v31.4s, v3.16b, v0.16b\n" "161:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 149b\n" - "ldr q0, [x14, #0x0]\n" - "ldr q1, [x14, #0x10]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" + "ldr q4, [x14, #0x0]\n" + "ldr q3, [x14, #0x10]\n" + "uzp1 v2.2d, v8.2d, v12.2d\n" "uzp2 v8.2d, v8.2d, v12.2d\n" - "ldr q2, [x14, #0x20]\n" - "ldr q3, [x14, #0x30]\n" + "ldr q1, [x14, #0x20]\n" + "ldr q0, [x14, #0x30]\n" "uzp1 v12.2d, v9.2d, v13.2d\n" "uzp2 v9.2d, v9.2d, v13.2d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" + "add x25, x11, x20\n" + "add x24, x25, x20\n" "uzp1 v13.2d, v10.2d, v14.2d\n" "uzp2 v10.2d, v10.2d, v14.2d\n" "uzp1 v14.2d, v11.2d, v15.2d\n" + "add x23, x24, x20\n" "add x22, x23, x20\n" - "add x21, x22, x20\n" "uzp2 v11.2d, v11.2d, v15.2d\n" "uzp1 v15.2d, v16.2d, v20.2d\n" - "add x20, x21, 
x20\n" + "add x21, x22, x20\n" "prfm pstl1keep, [x11, #0x0]\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x20, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" "add x14, x14, #0x40\n" "uzp2 v19.2d, v19.2d, v23.2d\n" "uzp1 v23.2d, v24.2d, v28.2d\n" @@ -3205,31 +3204,31 @@ void a64_hybrid_s8qs_mmla_6x16 ( "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v7.16b\n" - "add v31.4s, v31.4s, v0.4s\n" - "add v12.4s, v12.4s, v1.4s\n" - "add v13.4s, v13.4s, v2.4s\n" - "add v14.4s, v14.4s, v3.4s\n" - "add v8.4s, v8.4s, v0.4s\n" - "add v9.4s, v9.4s, v1.4s\n" - "add v10.4s, v10.4s, v2.4s\n" - "add v11.4s, v11.4s, v3.4s\n" - "add v15.4s, v15.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v1.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "mov v31.16b, v2.16b\n" + "add v31.4s, v31.4s, v4.4s\n" + "add v12.4s, v12.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v0.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v3.4s\n" + "add v10.4s, v10.4s, v1.4s\n" + "add v11.4s, v11.4s, v0.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v0.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" "tbz %x[flags], #4, 162f\n" "ldr q0, [x12, #0x0]\n" "ldr q4, [x13, #0x0]\n" @@ -3243,10 +3242,10 @@ void a64_hybrid_s8qs_mmla_6x16 ( "add x13, x13, #0x40\n" "b 163f\n" "162:" // Height 6: per layer parameters - "add x25, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x25]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x20]\n" "mov v1.16b, v0.16b\n" "mov v5.16b, v4.16b\n" "mov v2.16b, v0.16b\n" @@ -3279,91 +3278,91 @@ void a64_hybrid_s8qs_mmla_6x16 ( "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v27.4s, v27.4s, v7.4s\n" "tbz %x[flags], #5, 164f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v12.16b, v1.16b\n" - "and v6.16b, v13.16b, v2.16b\n" - "and v7.16b, v14.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v7.16b, v31.16b, v0.16b\n" + "and v6.16b, v12.16b, v1.16b\n" + "and v5.16b, v13.16b, v2.16b\n" + "and v4.16b, v14.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v12.4s, v12.4s, v5.4s\n" - "sqadd v13.4s, v13.4s, 
v6.4s\n" - "sqadd v14.4s, v14.4s, v7.4s\n" - "and v4.16b, v8.16b, v0.16b\n" - "and v5.16b, v9.16b, v1.16b\n" - "and v6.16b, v10.16b, v2.16b\n" - "and v7.16b, v11.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v8.4s, v8.4s, v4.4s\n" - "sqadd v9.4s, v9.4s, v5.4s\n" - "sqadd v10.4s, v10.4s, v6.4s\n" - "sqadd v11.4s, v11.4s, v7.4s\n" - "and v4.16b, v15.16b, v0.16b\n" - "and v5.16b, v20.16b, v1.16b\n" - "and v6.16b, v21.16b, v2.16b\n" - "and v7.16b, v22.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v1.16b\n" - "and v6.16b, v18.16b, v2.16b\n" - "and v7.16b, v19.16b, v3.16b\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v7.4s\n" + "sqadd v12.4s, v12.4s, v6.4s\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "sqadd v14.4s, v14.4s, v4.4s\n" + "and v7.16b, v8.16b, v0.16b\n" + "and v6.16b, v9.16b, v1.16b\n" + "and v5.16b, v10.16b, v2.16b\n" + "and v4.16b, v11.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v28.16b, v1.16b\n" - "and v6.16b, v29.16b, v2.16b\n" - "and v7.16b, v30.16b, v3.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v7.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "sqadd v10.4s, v10.4s, v5.4s\n" + "sqadd v11.4s, v11.4s, v4.4s\n" + "and v7.16b, v15.16b, v0.16b\n" + "and v6.16b, v20.16b, v1.16b\n" + "and v5.16b, v21.16b, v2.16b\n" + "and v4.16b, v22.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v28.4s, v28.4s, v5.4s\n" - "sqadd v29.4s, v29.4s, v6.4s\n" - "sqadd v30.4s, v30.4s, v7.4s\n" - "and v4.16b, v24.16b, v0.16b\n" - "and v5.16b, v25.16b, v1.16b\n" - "and v6.16b, v26.16b, v2.16b\n" - "and v7.16b, v27.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v6.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "sqadd v22.4s, v22.4s, v4.4s\n" + "and v7.16b, v16.16b, v0.16b\n" + "and v6.16b, v17.16b, v1.16b\n" + "and v5.16b, v18.16b, v2.16b\n" + "and v4.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v7.4s\n" + "sqadd v17.4s, v17.4s, v6.4s\n" + "sqadd v18.4s, v18.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "and v7.16b, v23.16b, v0.16b\n" + "and v6.16b, v28.16b, v1.16b\n" + "and v5.16b, v29.16b, v2.16b\n" + "and v4.16b, v30.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "sqadd v28.4s, v28.4s, v6.4s\n" + "sqadd v29.4s, v29.4s, v5.4s\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "and v7.16b, v24.16b, v0.16b\n" + "and v6.16b, v25.16b, v1.16b\n" + "and v5.16b, v26.16b, v2.16b\n" + "and v4.16b, v27.16b, v3.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v24.4s, v24.4s, 
v4.4s\n" - "sqadd v25.4s, v25.4s, v5.4s\n" - "sqadd v26.4s, v26.4s, v6.4s\n" - "sqadd v27.4s, v27.4s, v7.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v7.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "sqadd v27.4s, v27.4s, v4.4s\n" "164:" // Height 6: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v6.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v12.4s, v12.4s, v1.4s\n" "srshl v13.4s, v13.4s, v2.4s\n" "srshl v14.4s, v14.4s, v3.4s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v5.4s }, [x20]\n" "srshl v8.4s, v8.4s, v0.4s\n" "srshl v9.4s, v9.4s, v1.4s\n" - "add x25, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x25]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v4.4s }, [x20]\n" "srshl v10.4s, v10.4s, v2.4s\n" "srshl v11.4s, v11.4s, v3.4s\n" "cmp x10, #0x10\n" @@ -3383,225 +3382,225 @@ void a64_hybrid_s8qs_mmla_6x16 ( "srshl v25.4s, v25.4s, v1.4s\n" "srshl v26.4s, v26.4s, v2.4s\n" "srshl v27.4s, v27.4s, v3.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v12.4s, v12.4s, v4.4s\n" - "add v13.4s, v13.4s, v4.4s\n" - "add v14.4s, v14.4s, v4.4s\n" - "add v8.4s, v8.4s, v4.4s\n" - "add v9.4s, v9.4s, v4.4s\n" - "add v10.4s, v10.4s, v4.4s\n" - "add v11.4s, v11.4s, v4.4s\n" - "add v15.4s, v15.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v12.4s, v12.4s, v6.4s\n" - "smin v13.4s, v13.4s, v6.4s\n" - "smin v14.4s, v14.4s, v6.4s\n" - "smin v8.4s, v8.4s, v6.4s\n" - "smin v9.4s, v9.4s, v6.4s\n" - "smin v10.4s, v10.4s, v6.4s\n" - "smin v11.4s, v11.4s, v6.4s\n" - "smin v15.4s, v15.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax v12.4s, v12.4s, v5.4s\n" - "smax v13.4s, v13.4s, v5.4s\n" - "smax v14.4s, v14.4s, v5.4s\n" - "smax v8.4s, v8.4s, v5.4s\n" - "smax v9.4s, v9.4s, v5.4s\n" - "smax v10.4s, v10.4s, v5.4s\n" - "smax v11.4s, v11.4s, v5.4s\n" - "smax v15.4s, v15.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v6.4s\n" + "add 
v12.4s, v12.4s, v6.4s\n" + "add v13.4s, v13.4s, v6.4s\n" + "add v14.4s, v14.4s, v6.4s\n" + "add v8.4s, v8.4s, v6.4s\n" + "add v9.4s, v9.4s, v6.4s\n" + "add v10.4s, v10.4s, v6.4s\n" + "add v11.4s, v11.4s, v6.4s\n" + "add v15.4s, v15.4s, v6.4s\n" + "add v20.4s, v20.4s, v6.4s\n" + "add v21.4s, v21.4s, v6.4s\n" + "add v22.4s, v22.4s, v6.4s\n" + "add v16.4s, v16.4s, v6.4s\n" + "add v17.4s, v17.4s, v6.4s\n" + "add v18.4s, v18.4s, v6.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "add v23.4s, v23.4s, v6.4s\n" + "add v28.4s, v28.4s, v6.4s\n" + "add v29.4s, v29.4s, v6.4s\n" + "add v30.4s, v30.4s, v6.4s\n" + "add v24.4s, v24.4s, v6.4s\n" + "add v25.4s, v25.4s, v6.4s\n" + "add v26.4s, v26.4s, v6.4s\n" + "add v27.4s, v27.4s, v6.4s\n" + "smin v31.4s, v31.4s, v5.4s\n" + "smin v12.4s, v12.4s, v5.4s\n" + "smin v13.4s, v13.4s, v5.4s\n" + "smin v14.4s, v14.4s, v5.4s\n" + "smin v8.4s, v8.4s, v5.4s\n" + "smin v9.4s, v9.4s, v5.4s\n" + "smin v10.4s, v10.4s, v5.4s\n" + "smin v11.4s, v11.4s, v5.4s\n" + "smin v15.4s, v15.4s, v5.4s\n" + "smin v20.4s, v20.4s, v5.4s\n" + "smin v21.4s, v21.4s, v5.4s\n" + "smin v22.4s, v22.4s, v5.4s\n" + "smin v16.4s, v16.4s, v5.4s\n" + "smin v17.4s, v17.4s, v5.4s\n" + "smin v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v5.4s\n" + "smin v23.4s, v23.4s, v5.4s\n" + "smin v28.4s, v28.4s, v5.4s\n" + "smin v29.4s, v29.4s, v5.4s\n" + "smin v30.4s, v30.4s, v5.4s\n" + "smin v24.4s, v24.4s, v5.4s\n" + "smin v25.4s, v25.4s, v5.4s\n" + "smin v26.4s, v26.4s, v5.4s\n" + "smin v27.4s, v27.4s, v5.4s\n" + "smax v31.4s, v31.4s, v4.4s\n" + "smax v12.4s, v12.4s, v4.4s\n" + "smax v13.4s, v13.4s, v4.4s\n" + "smax v14.4s, v14.4s, v4.4s\n" + "smax v8.4s, v8.4s, v4.4s\n" + "smax v9.4s, v9.4s, v4.4s\n" + "smax v10.4s, v10.4s, v4.4s\n" + "smax v11.4s, v11.4s, v4.4s\n" + "smax v15.4s, v15.4s, v4.4s\n" + "smax v20.4s, v20.4s, v4.4s\n" + "smax v21.4s, v21.4s, v4.4s\n" + "smax v22.4s, v22.4s, v4.4s\n" + "smax v16.4s, v16.4s, v4.4s\n" + "smax v17.4s, v17.4s, v4.4s\n" + "smax v18.4s, v18.4s, v4.4s\n" + "smax v19.4s, v19.4s, v4.4s\n" + "smax v23.4s, v23.4s, v4.4s\n" + "smax v28.4s, v28.4s, v4.4s\n" + "smax v29.4s, v29.4s, v4.4s\n" + "smax v30.4s, v30.4s, v4.4s\n" + "smax v24.4s, v24.4s, v4.4s\n" + "smax v25.4s, v25.4s, v4.4s\n" + "smax v26.4s, v26.4s, v4.4s\n" + "smax v27.4s, v27.4s, v4.4s\n" "uzp1 v31.8h, v31.8h, v12.8h\n" - "uzp1 v12.8h, v13.8h, v14.8h\n" + "uzp1 v1.8h, v13.8h, v14.8h\n" "uzp1 v8.8h, v8.8h, v9.8h\n" - "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v0.8h, v10.8h, v11.8h\n" "uzp1 v15.8h, v15.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v18.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "uzp1 v31.16b, v31.16b, v12.16b\n" - "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v31.16b, v31.16b, v1.16b\n" + "uzp1 v8.16b, v8.16b, v0.16b\n" "uzp1 v15.16b, v15.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v23.16b, v23.16b, v28.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v23.16b, v23.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 173f\n" "tbz x10, #3, 168f\n" "str d31, [x11], #0x8\n" - "str d8, [x24], #0x8\n" - "str d15, [x23], #0x8\n" - "str d16, [x22], #0x8\n" - "str d23, [x21], #0x8\n" - "str d24, [x20], #0x8\n" + "str d8, [x25], #0x8\n" + "str d15, [x24], #0x8\n" + "str d16, [x23], #0x8\n" + "str d23, [x22], 
#0x8\n" + "str d24, [x21], #0x8\n" "tbz x10, #2, 166f\n" "st1 { v31.s }[2], [x11], #0x4\n" - "st1 { v8.s }[2], [x24], #0x4\n" - "st1 { v15.s }[2], [x23], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v23.s }[2], [x21], #0x4\n" - "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v8.s }[2], [x25], #0x4\n" + "st1 { v15.s }[2], [x24], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "st1 { v24.s }[2], [x21], #0x4\n" "tbz x10, #1, 165f\n" "st1 { v31.h }[6], [x11], #0x2\n" - "st1 { v8.h }[6], [x24], #0x2\n" - "st1 { v15.h }[6], [x23], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v23.h }[6], [x21], #0x2\n" - "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v8.h }[6], [x25], #0x2\n" + "st1 { v15.h }[6], [x24], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v23.h }[6], [x22], #0x2\n" + "st1 { v24.h }[6], [x21], #0x2\n" "tbz x10, #0, 172f\n" "st1 { v31.b }[14], [x11]\n" - "st1 { v8.b }[14], [x24]\n" - "st1 { v15.b }[14], [x23]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v23.b }[14], [x21]\n" - "st1 { v24.b }[14], [x20]\n" + "st1 { v8.b }[14], [x25]\n" + "st1 { v15.b }[14], [x24]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v23.b }[14], [x22]\n" + "st1 { v24.b }[14], [x21]\n" "b 172f\n" "165:" // Height 6: Partial direct writeback: partial_1_12 "tbz x10, #0, 172f\n" "st1 { v31.b }[12], [x11]\n" - "st1 { v8.b }[12], [x24]\n" - "st1 { v15.b }[12], [x23]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v23.b }[12], [x21]\n" - "st1 { v24.b }[12], [x20]\n" + "st1 { v8.b }[12], [x25]\n" + "st1 { v15.b }[12], [x24]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v23.b }[12], [x22]\n" + "st1 { v24.b }[12], [x21]\n" "b 172f\n" "166:" // Height 6: Partial direct writeback: partial_2_8 "tbz x10, #1, 167f\n" "st1 { v31.h }[4], [x11], #0x2\n" - "st1 { v8.h }[4], [x24], #0x2\n" - "st1 { v15.h }[4], [x23], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v23.h }[4], [x21], #0x2\n" - "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v8.h }[4], [x25], #0x2\n" + "st1 { v15.h }[4], [x24], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v23.h }[4], [x22], #0x2\n" + "st1 { v24.h }[4], [x21], #0x2\n" "tbz x10, #0, 172f\n" "st1 { v31.b }[10], [x11]\n" - "st1 { v8.b }[10], [x24]\n" - "st1 { v15.b }[10], [x23]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v23.b }[10], [x21]\n" - "st1 { v24.b }[10], [x20]\n" + "st1 { v8.b }[10], [x25]\n" + "st1 { v15.b }[10], [x24]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v23.b }[10], [x22]\n" + "st1 { v24.b }[10], [x21]\n" "b 172f\n" "167:" // Height 6: Partial direct writeback: partial_1_8 "tbz x10, #0, 172f\n" "st1 { v31.b }[8], [x11]\n" - "st1 { v8.b }[8], [x24]\n" - "st1 { v15.b }[8], [x23]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v23.b }[8], [x21]\n" - "st1 { v24.b }[8], [x20]\n" + "st1 { v8.b }[8], [x25]\n" + "st1 { v15.b }[8], [x24]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v23.b }[8], [x22]\n" + "st1 { v24.b }[8], [x21]\n" "b 172f\n" "168:" // Height 6: Partial direct writeback: partial_4_0 "tbz x10, #2, 170f\n" "str s31, [x11], #0x4\n" - "str s8, [x24], #0x4\n" - "str s15, [x23], #0x4\n" - "str s16, [x22], #0x4\n" - "str s23, [x21], #0x4\n" - "str s24, [x20], #0x4\n" + "str s8, [x25], #0x4\n" + "str s15, [x24], #0x4\n" + "str s16, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "str s24, [x21], #0x4\n" "tbz x10, #1, 169f\n" "st1 { v31.h }[2], [x11], #0x2\n" - "st1 { v8.h }[2], [x24], #0x2\n" - "st1 { v15.h }[2], [x23], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v23.h }[2], [x21], #0x2\n" - "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v8.h }[2], 
[x25], #0x2\n" + "st1 { v15.h }[2], [x24], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v23.h }[2], [x22], #0x2\n" + "st1 { v24.h }[2], [x21], #0x2\n" "tbz x10, #0, 172f\n" "st1 { v31.b }[6], [x11]\n" - "st1 { v8.b }[6], [x24]\n" - "st1 { v15.b }[6], [x23]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v23.b }[6], [x21]\n" - "st1 { v24.b }[6], [x20]\n" + "st1 { v8.b }[6], [x25]\n" + "st1 { v15.b }[6], [x24]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v23.b }[6], [x22]\n" + "st1 { v24.b }[6], [x21]\n" "b 172f\n" "169:" // Height 6: Partial direct writeback: partial_1_4 "tbz x10, #0, 172f\n" "st1 { v31.b }[4], [x11]\n" - "st1 { v8.b }[4], [x24]\n" - "st1 { v15.b }[4], [x23]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v23.b }[4], [x21]\n" - "st1 { v24.b }[4], [x20]\n" + "st1 { v8.b }[4], [x25]\n" + "st1 { v15.b }[4], [x24]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v23.b }[4], [x22]\n" + "st1 { v24.b }[4], [x21]\n" "b 172f\n" "170:" // Height 6: Partial direct writeback: partial_2_0 "tbz x10, #1, 171f\n" "str h31, [x11], #0x2\n" - "str h8, [x24], #0x2\n" - "str h15, [x23], #0x2\n" - "str h16, [x22], #0x2\n" - "str h23, [x21], #0x2\n" - "str h24, [x20], #0x2\n" + "str h8, [x25], #0x2\n" + "str h15, [x24], #0x2\n" + "str h16, [x23], #0x2\n" + "str h23, [x22], #0x2\n" + "str h24, [x21], #0x2\n" "tbz x10, #0, 172f\n" "st1 { v31.b }[2], [x11]\n" - "st1 { v8.b }[2], [x24]\n" - "st1 { v15.b }[2], [x23]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v23.b }[2], [x21]\n" - "st1 { v24.b }[2], [x20]\n" + "st1 { v8.b }[2], [x25]\n" + "st1 { v15.b }[2], [x24]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v23.b }[2], [x22]\n" + "st1 { v24.b }[2], [x21]\n" "b 172f\n" "171:" // Height 6: Partial direct writeback: partial_1_0 "str b31, [x11, #0x0]\n" - "str b8, [x24, #0x0]\n" - "str b15, [x23, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b23, [x21, #0x0]\n" - "str b24, [x20, #0x0]\n" + "str b8, [x25, #0x0]\n" + "str b15, [x24, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b23, [x22, #0x0]\n" + "str b24, [x21, #0x0]\n" "172:" // Height 6: Partial direct writeback: Done "b 174f\n" "173:" // Height 6: Full writeback "str q31, [x11, #0x0]\n" "add x11, x11, #0x10\n" - "str q8, [x24, #0x0]\n" - "str q15, [x23, #0x0]\n" - "str q16, [x22, #0x0]\n" - "str q23, [x21, #0x0]\n" - "str q24, [x20, #0x0]\n" + "str q8, [x25, #0x0]\n" + "str q15, [x24, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q23, [x22, #0x0]\n" + "str q24, [x21, #0x0]\n" "174:" // Height 6: Writeback done "subs x10, x10, #0x10\n" "bgt 147b\n" @@ -3617,7 +3616,6 @@ void a64_hybrid_s8qs_mmla_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "176:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, 
per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp index 48ce67613e..a02fbe8f28 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -79,12 +79,12 @@ public: switch (ci->get_cpu_model()) { default: return { 31.65 }; - case CPUModel::A55r1: - return { 9.217 }; case CPUModel::A510: return { 15.87 }; case CPUModel::V1: return { 54.50 }; + case CPUModel::A55r1: + return { 9.217 }; } } @@ -97,7 +97,7 @@ public: case CPUModel::A510: return { 16.66, 3.92, 0.48 }; case CPUModel::V1: - return { 55.40, 19.21, 0.93 }; + return { 42.62, 16.32, 0.83 }; } } @@ -121,5 +121,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp index 8046b2ebb0..289d38c3b6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp @@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 171f\n" @@ -165,11 +164,11 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 15f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" "cbnz x15, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" @@ -186,129 +185,129 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "blt 18f\n" "17:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x16, #0x20]\n" - "ldr x12, [x16, #0x28]\n" + "ldr d17, [x16, #0x20]\n" + "ldr x20, [x16, #0x28]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x38]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x78]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xb8]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 
0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xf8]\n" - "mov v7.d[1], x11\n" + "ldr d16, [x16, #0x30]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x38]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr d17, [x16, #0x40]\n" + "ldr x20, [x16, #0x48]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr d16, [x16, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x16, #0x60]\n" + "ldr x20, [x16, #0x68]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x16, #0x70]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x78]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x16, #0x80]\n" + "ldr x20, [x16, #0x88]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x16, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x16, #0xa0]\n" + "ldr x20, [x16, #0xa8]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x16, #0xb0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xb8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x16, #0xc0]\n" + "ldr x20, [x16, #0xc8]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x16, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr d17, [x16, #0xe0]\n" + "ldr x20, [x16, #0xe8]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr d16, [x16, #0xf0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xf8]\n" + "mov v16.d[1], x20\n" "add x13, x13, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "ldr d6, [x16, #0x0]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr x20, [x16, #0x8]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x18]\n" - "mov v0.d[1], x10\n" - "mov v7.d[1], x11\n" + "ldr x21, [x13, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x16, #0x18]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x13, #0x80]\n" "bge 17b\n" "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q17, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x16, 
#0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x16, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x16, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x16, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x16, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x16, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x16, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x16, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x16, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x16, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x16, #0xf0]\n" "add x13, x13, #0x10\n" "sub x14, x14, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" "19:" // Height 1: Multiply loop: Main loop skip "cbz x14, 24f\n" "cmp x14, #0x4\n" "blt 21f\n" "20:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s18, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q16, [x16, #0x0]\n" + ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" + "ldr q17, [x16, #0x20]\n" "cmp x14, #0x4\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" "add x16, x16, #0x40\n" "bge 20b\n" "21:" // Height 1: Multiply loop: Skip odd blocks @@ -321,14 +320,14 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x0]\n" + ".inst 0x4f80e208 // sdot v8.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x20]\n" + ".inst 0x4f80e20a // sdot 
v10.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -499,226 +498,226 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" "cbnz x15, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" + "add x12, x12, x20\n" "b 50f\n" "49:" // Height 2: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" + "add x12, x13, x21\n" "50:" // Height 2: input setup done "cmp x14, #0x10\n" "blt 53f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" + "ldr q1, [x12, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 52f\n" "51:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d17, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr x12, [x16, 
#0xc8]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v6.d[1], x12\n" + "ldr d16, [x16, #0x30]\n" + "mov v17.d[1], x21\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr d17, [x16, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr x20, [x16, #0x48]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr d16, [x16, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x16, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x16, #0x70]\n" + "mov v17.d[1], x21\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x16, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr x20, [x16, #0x88]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x16, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x16, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x16, #0xb0]\n" + "mov v17.d[1], x21\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "mov v16.d[1], x20\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x16, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr x20, [x16, #0xc8]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x16, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr d17, [x16, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr d16, [x16, #0xf0]\n" + "mov v17.d[1], x21\n" "add x13, x13, #0x10\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" + "mov v16.d[1], x20\n" + "add x12, x12, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" "ldr d6, [x16, #0x0]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "ldr x21, [x16, #0x8]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, 
#0x0]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x28, [x9, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x16, #0x18]\n" - "mov v1.d[1], x28\n" + "ldr x20, [x13, #0x8]\n" + "mov v6.d[1], x21\n" + "ldr x21, [x12, #0x8]\n" + "mov v0.d[1], x20\n" + "ldr x20, [x16, #0x18]\n" + "mov v1.d[1], x21\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v7.d[1], x11\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v7.d[1], x20\n" + "prfm pldl1keep, [x12, #0x80]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q17, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" "sub x14, x14, #0x10\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x16, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x16, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x16, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x16, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 
0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x16, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x16, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x16, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x16, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x16, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x16, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x16, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x16, #0xf0]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x14, 58f\n" "cmp x14, #0x4\n" "blt 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s19, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s18, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x0]\n" + ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x16, #0x20]\n" + ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" "bge 54b\n" "55:" // Height 2: Multiply loop: Skip odd blocks "cbz x14, 58f\n" "tbz x14, #1, 56f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" + "ldr h1, [x12], #0x2\n" "tbz x14, #0, 57f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x12]\n" "b 57f\n" "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" + "ldr b1, [x12, #0x0]\n" "57:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // 
sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x0]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x16, #0x20]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" "58:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -936,281 +935,281 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 83f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" "cbnz x15, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" "b 84f\n" "83:" // Height 3: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" "84:" // Height 3: input setup done "cmp x14, #0x10\n" "blt 87f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 86f\n" "85:" // Height 3: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d21, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v21.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr 
d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr d20, [x16, #0x30]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr d21, [x16, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr d20, [x16, #0x50]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x16, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x16, #0x70]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x16, 
#0x98]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x16, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x16, #0x90]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x16, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x16, #0xb0]\n" + "mov v20.d[1], x20\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x16, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x16, #0xd0]\n" + "mov v20.d[1], x20\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr d21, [x16, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + "mov v21.d[1], x21\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" "add x13, x13, #0x10\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr d20, [x16, #0xf0]\n" + "mov v20.d[1], x20\n" + "add x12, x12, #0x10\n" + "add x11, x11, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + "ldr x20, [x16, #0x8]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + "ldr x23, [x13, #0x8]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + "ldr x22, [x12, #0x8]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x26, [x27, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x18]\n" - "mov v0.d[1], x10\n" + "ldr x21, [x11, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x16, #0x18]\n" + "mov v0.d[1], x23\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v1.d[1], x28\n" - "prfm pldl1keep, [x9, #0x80]\n" - "mov v2.d[1], x26\n" - "prfm pldl1keep, [x27, #0x80]\n" - "mov v7.d[1], x11\n" + "mov v1.d[1], x22\n" 
+ "prfm pldl1keep, [x12, #0x80]\n" + "mov v2.d[1], x21\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v7.d[1], x20\n" "bge 85b\n" "86:" // Height 3: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q21, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x16, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + 
"prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x16, #0x50]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x16, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x16, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x16, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x16, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x16, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x16, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x16, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x16, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x16, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x16, #0xf0]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "87:" // Height 3: Multiply loop: Main loop skip "cbz x14, 92f\n" "cmp x14, #0x4\n" "blt 89f\n" "88:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s24, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s23, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, 
#0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s22, [x11], #0x4\n" + "ldr q21, [x16, #0x0]\n" + ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" + "ldr q20, [x16, #0x10]\n" + ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x16, #0x20]\n" + ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" "bge 88b\n" "89:" // Height 3: Multiply loop: Skip odd blocks "cbz x14, 92f\n" "tbz x14, #1, 90f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" "tbz x14, #0, 91f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" "b 91f\n" "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" "91:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q21, [x16, #0x0]\n" + ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" + "ldr q20, [x16, #0x10]\n" + ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x16, #0x20]\n" + ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" "92:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" 
"add x15, x15, #0x1\n" @@ -1475,336 +1474,336 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 117f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" "cbnz x15, 118f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "b 118f\n" "117:" // Height 4: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" "118:" // Height 4: input setup done "cmp x14, #0x10\n" "blt 121f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 120f\n" "119:" // Height 4: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d25, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v25.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, 
v2.4b[1]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d24, [x16, #0x30]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + "add x11, x11, #0x10\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr d25, [x16, #0x40]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr d24, [x16, #0x50]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + "ldr x25, [x13, #0x8]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x16, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + "ldr x24, [x12, #0x8]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x16, #0x70]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x98]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + "ldr x23, [x11, #0x8]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x16, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + "ldr x22, [x10, #0x8]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x16, #0x90]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" "sub x14, x14, #0x10\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot 
v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x16, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" "cmp x14, #0x20\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x16, #0xb0]\n" + "mov v24.d[1], x20\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x16, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x16, #0xd0]\n" + "mov v24.d[1], x20\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr d25, [x16, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + "mov v25.d[1], x21\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr d24, [x16, #0xf0]\n" + "mov v24.d[1], x20\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + 
"ldr x21, [x16, #0x8]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x16, #0x18]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" + "ldr d3, [x10, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v7.d[1], x11\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x24\n" + "mov v2.d[1], x23\n" + "mov v3.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 119b\n" "120:" // Height 4: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q25, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot 
v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x16, #0x40]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x16, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x16, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x16, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x16, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x16, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x16, #0xa0]\n" + ".inst 0x4f80eb09 
// sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x16, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x16, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x16, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x16, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x16, #0xf0]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "121:" // Height 4: Multiply loop: Main loop skip "cbz x14, 126f\n" "cmp x14, #0x4\n" "blt 123f\n" "122:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s29, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s28, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s27, [x11], #0x4\n" + "ldr s26, [x10], #0x4\n" + "ldr q25, [x16, #0x0]\n" + ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" + "ldr q24, [x16, #0x10]\n" + ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x16, #0x20]\n" + ".inst 0x4f9de309 // sdot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30d // sdot v13.4s, 
v24.16b, v28.4b[0]\n" + ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" "bge 122b\n" "123:" // Height 4: Multiply loop: Skip odd blocks "cbz x14, 126f\n" "tbz x14, #1, 124f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" "tbz x14, #0, 125f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" "b 125f\n" "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" "125:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q25, [x16, #0x0]\n" + ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" + "ldr q24, [x16, #0x10]\n" + ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x16, #0x20]\n" + ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, 
v25.16b, v1.4b[0]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" "126:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -2108,399 +2107,399 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "149:" // Height 5: setup done - "mov x15, #0x0\n" - "150:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 151f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "cbnz x15, 152f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "b 152f\n" - "151:" // Height 5: setup direct input - "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "152:" // Height 5: input setup done - "cmp x14, #0x10\n" - "blt 155f\n" - "ldr q0, [x13, #0x0]\n" - "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q6, [x16, #0x0]\n" - "ldr q7, [x16, #0x10]\n" - "blt 154f\n" - "153:" // Height 5: Multiply loop: Main loop head - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x10\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov 
v7.d[1], x11\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr x22, [x23, #0x8]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "sub x14, x14, #0x10\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "cmp x14, #0x20\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov 
v6.d[1], x12\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x15, #0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w14, [x20, x15, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "cbnz x15, 152f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x13, x13, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" + "add x9, x9, x20\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x13, %x[input_ptr]\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "152:" // Height 5: input setup done + "cmp x14, #0x10\n" + "blt 155f\n" + "ldr q0, [x13, #0x0]\n" + "cmp x14, #0x20\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr x21, [x16, #0x28]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x38]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x13, x13, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "add x12, x12, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr d29, [x16, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x48]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "add x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr d28, [x16, #0x30]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + "add x9, x9, #0x10\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + "ldr x26, [x13, #0x8]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr d29, [x16, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + "ldr x25, [x12, #0x8]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr d28, [x16, #0x50]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + "ldr x22, [x9, 
#0x8]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x16, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + "sub x14, x14, #0x10\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + "cmp x14, #0x20\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x16, #0x70]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x98]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + "prfm pldl1keep, [x13, #0x80]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x16, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x16, #0x90]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x16, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x16, #0xb0]\n" + "mov v28.d[1], x20\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x16, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x16, #0xd0]\n" + "mov v28.d[1], x20\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr d29, [x16, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + "mov v29.d[1], x21\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr d28, 
[x16, #0xf0]\n" + "mov v28.d[1], x20\n" "add x16, x16, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + "ldr x21, [x16, #0x8]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x16, #0x18]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + "ldr d3, [x10, #0x0]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" + "ldr d4, [x9, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x26\n" + "mov v1.d[1], x25\n" + "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" "mov v4.d[1], x22\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "bge 153b\n" "154:" // Height 5: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q29, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, 
v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, 
v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x16, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x16, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x16, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x16, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x16, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x16, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x16, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x16, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x16, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x16, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x16, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x16, #0xf0]\n" + ".inst 
0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" "155:" // Height 5: Multiply loop: Main loop skip "cbz x14, 160f\n" "cmp x14, #0x4\n" "blt 157f\n" "156:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s2, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s1, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s0, [x11], #0x4\n" + "ldr s31, [x10], #0x4\n" + "ldr s30, [x9], #0x4\n" + "ldr q29, [x16, #0x0]\n" + ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" + "ldr q28, [x16, #0x10]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x16, #0x20]\n" + ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f81e3ae // 
sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" "bge 156b\n" "157:" // Height 5: Multiply loop: Skip odd blocks "cbz x14, 160f\n" "tbz x14, #1, 158f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" + "ldr h4, [x9], #0x2\n" "tbz x14, #0, 159f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" + "ld1 { v4.b }[2], [x9]\n" "b 159f\n" "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" + "ldr b4, [x9, #0x0]\n" "159:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q29, [x16, #0x0]\n" + ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" + "ldr q28, [x16, #0x10]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x16, #0x20]\n" + ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, 
v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" "160:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -2862,98 +2861,98 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 185f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "ldr x28, [x20, #0x28]\n" "cbnz x15, 186f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "add x21, x21, x20\n" + "add x28, x28, x20\n" "b 186f\n" "185:" // Height 6: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "add x21, x23, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "add x28, x9, x21\n" "186:" // Height 6: input setup done "cmp x14, #0x10\n" "blt 189f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x21, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x28, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 188f\n" "187:" // Height 6: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "add x21, x21, #0x10\n" + 
"add x28, x28, #0x10\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" + "ldr x20, [x16, #0x58]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr x10, [x13, #0x8]\n" + "ldr x27, [x13, #0x8]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x26, [x12, #0x8]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr x26, [x27, #0x8]\n" + "ldr x25, [x11, #0x8]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x16, #0x40]\n" ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" + "ldr x21, [x16, #0x68]\n" ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x24, [x10, #0x8]\n" ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x23, [x9, #0x8]\n" ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr x20, [x21, #0x8]\n" + "ldr x22, [x28, #0x8]\n" ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" + "ldr x20, [x16, #0x78]\n" ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" "sub x14, x14, #0x10\n" ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" @@ -2963,96 +2962,96 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x16, #0x60]\n" ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" + "ldr x21, [x16, #0x88]\n" ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" + "ldr x20, [x16, #0x98]\n" ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x16, #0x80]\n" ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" + "ldr x21, [x16, #0xa8]\n" ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" 
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" + "ldr x20, [x16, #0xb8]\n" ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x16, #0xa0]\n" ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" + "ldr x21, [x16, #0xc8]\n" ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" + "ldr x20, [x16, #0xd8]\n" ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x16, #0xc0]\n" ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" + "ldr x21, [x16, #0xe8]\n" ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" + "ldr x20, [x16, #0xf8]\n" ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" "ldr d6, [x16, #0xe0]\n" ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "add x16, x16, #0x100\n" ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" + "ldr x21, [x16, #0x8]\n" ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" + "ldr x20, [x16, #0x18]\n" ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" @@ -3061,56 +3060,56 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x12, #0x0]\n" ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x11, #0x0]\n" ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d3, [x10, #0x0]\n" ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, 
v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" + "ldr d4, [x9, #0x0]\n" ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" - "ldr d5, [x21, #0x0]\n" + "ldr d5, [x28, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x26\n" + "mov v2.d[1], x25\n" "mov v3.d[1], x24\n" - "mov v4.d[1], x22\n" - "mov v5.d[1], x20\n" - "mov v7.d[1], x11\n" + "mov v4.d[1], x23\n" + "mov v5.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 187b\n" "188:" // Height 6: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" "ldr q6, [x16, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" "ldr q7, [x16, #0x30]\n" ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" @@ -3210,98 +3209,98 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "cmp x14, #0x4\n" "blt 191f\n" "190:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s7, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s6, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s5, [x11], 
#0x4\n" + "ldr s4, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr s2, [x28], #0x4\n" + "ldr q1, [x16, #0x0]\n" + ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" + "ldr q0, [x16, #0x10]\n" + ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x16, #0x20]\n" + ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x16, #0x30]\n" + ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" "bge 190b\n" "191:" // Height 6: Multiply loop: Skip odd blocks "cbz x14, 194f\n" "tbz x14, #1, 192f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" - "ldr h5, [x21], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" + "ldr h4, [x9], #0x2\n" + "ldr h5, [x28], #0x2\n" "tbz x14, #0, 193f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" - "ld1 { v5.b }[2], [x21]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" + "ld1 { v4.b }[2], [x9]\n" + "ld1 { v5.b }[2], [x28]\n" "b 193f\n" "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" - "ldr b5, [x21, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" + "ldr b4, [x9, #0x0]\n" + "ldr b5, [x28, #0x0]\n" "193:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 
0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x0]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x16, #0x10]\n" + ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x16, #0x20]\n" + ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x16, #0x30]\n" + ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" "194:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -3488,7 +3487,6 @@ void a64_hybrid_s8s32_dot_6x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "206:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", 
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp index ddf776107a..452d647bb4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp @@ -77,7 +77,6 @@ void a64_hybrid_s8s32_dot_6x16 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 171f\n" @@ -165,11 +164,11 @@ void a64_hybrid_s8s32_dot_6x16 ( "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 15f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -186,37 +185,37 @@ void a64_hybrid_s8s32_dot_6x16 ( "blt 18f\n" "17:" // Height 1: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, 
#0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x20\n" "add x10, x10, #0x100\n" @@ -226,37 +225,37 @@ void a64_hybrid_s8s32_dot_6x16 ( "bge 17b\n" "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "19:" // Height 1: Multiply loop: Main loop skip @@ -264,17 +263,17 @@ void a64_hybrid_s8s32_dot_6x16 ( "cmp x27, #0x4\n" "blt 21f\n" "20:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, 
v6.16b, v0.4b[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x10, #0x0]\n" + ".inst 0x4f92e208 // sdot v8.4s, v16.16b, v18.4b[0]\n" "sub x27, x27, #0x4\n" - "ldr q7, [x10, #0x10]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x10]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f92e209 // sdot v9.4s, v16.16b, v18.4b[0]\n" "cmp x27, #0x4\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f92e22a // sdot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x4f92e20b // sdot v11.4s, v16.16b, v18.4b[0]\n" "add x10, x10, #0x40\n" "bge 20b\n" "21:" // Height 1: Multiply loop: Skip odd blocks @@ -287,14 +286,14 @@ void a64_hybrid_s8s32_dot_6x16 ( "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" "add x10, x10, #0x40\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -465,12 +464,12 @@ void a64_hybrid_s8s32_dot_6x16 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -478,7 +477,7 @@ void a64_hybrid_s8s32_dot_6x16 ( "b 50f\n" "49:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "50:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 53f\n" @@ -491,137 +490,137 @@ void a64_hybrid_s8s32_dot_6x16 ( "51:" // Height 2: Multiply loop: Main loop head ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "sub x27, x27, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x26, x26, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x40]\n" "add x25, x25, #0x10\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot 
v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x50]\n" "cmp x27, #0x20\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + 
".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "add x26, x26, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x25, x25, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x40]\n" "sub x27, x27, #0x10\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x60]\n" + ".inst 0x4fa0e228 // sdot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22c // sdot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4fa0e209 // sdot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20d // sdot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x4fa0e22a // sdot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e22e // sdot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4fa0e20b // sdot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x4fa1e20f // sdot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x4f80ea28 // sdot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2c // sdot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x4f80ea09 // sdot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0d // sdot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x4f80ea2a // sdot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f81ea2e // sdot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, 
[x10, #0xc0]\n" + ".inst 0x4f80ea0b // sdot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x4f81ea0f // sdot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x4fa0ea28 // sdot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2c // sdot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x4fa0ea09 // sdot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0d // sdot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa0ea2a // sdot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa1ea2e // sdot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x4fa0ea0b // sdot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x4fa1ea0f // sdot v15.4s, v16.16b, v1.4b[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x27, 58f\n" "cmp x27, #0x4\n" "blt 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x4f93e228 // sdot v8.4s, v17.16b, v19.4b[0]\n" + ".inst 0x4f92e22c // sdot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f93e209 // sdot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20d // sdot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f93e22a // sdot v10.4s, v17.16b, v19.4b[0]\n" + ".inst 0x4f92e22e // sdot v14.4s, v17.16b, v18.4b[0]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f93e20b // sdot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x4f92e20f // sdot v15.4s, v16.16b, v18.4b[0]\n" "bge 54b\n" "55:" // Height 2: Multiply loop: Skip odd blocks "cbz x27, 58f\n" @@ -636,19 +635,19 @@ void a64_hybrid_s8s32_dot_6x16 ( "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "57:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x4f80e228 // sdot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22c // sdot v12.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4f80e209 // sdot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20d // sdot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x4f80e22a // sdot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f81e22e // sdot v14.4s, v17.16b, v1.4b[0]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e20b // sdot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e20f // sdot v15.4s, v16.16b, v1.4b[0]\n" "58:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -866,13 +865,13 @@ void a64_hybrid_s8s32_dot_6x16 ( "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 83f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -881,8 +880,8 @@ void a64_hybrid_s8s32_dot_6x16 ( "b 84f\n" "83:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "84:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 87f\n" @@ -899,75 +898,75 @@ void a64_hybrid_s8s32_dot_6x16 ( "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x25, x25, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "add x24, x24, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, 
v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + 
".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 85b\n" @@ -977,98 +976,98 @@ void a64_hybrid_s8s32_dot_6x16 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x24, x24, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "sub x27, x27, #0x10\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" 
+ ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x50]\n" + ".inst 0x4fa0e2a8 // sdot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ac // sdot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b0 // sdot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 0x4fa0e289 // sdot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28d // sdot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e291 // sdot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x4fa0e2aa // sdot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x4fa1e2ae // sdot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa2e2b2 // sdot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x4fa0e28b // sdot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x4fa1e28f // sdot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x4fa2e293 // sdot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x4f80eaa8 // sdot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaac // sdot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab0 // sdot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x4f80ea89 // sdot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8d // sdot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea91 // sdot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x4f80eaaa // sdot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x4f81eaae // sdot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x4f82eab2 // sdot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x4f80ea8b // sdot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f81ea8f // sdot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x4f82ea93 // sdot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaac // sdot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab0 // sdot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x4fa0ea89 // sdot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8d // sdot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea91 // sdot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa0eaaa // sdot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa1eaae // sdot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x4fa2eab2 // sdot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa0ea8b // sdot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4fa1ea8f // sdot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x4fa2ea93 // sdot v19.4s, v20.16b, v2.4b[3]\n" "87:" // Height 3: Multiply loop: Main loop skip "cbz x27, 92f\n" "cmp x27, #0x4\n" "blt 89f\n" "88:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, 
v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x10, #0x0]\n" + ".inst 0x4f98e2a8 // sdot v8.4s, v21.16b, v24.4b[0]\n" + ".inst 0x4f97e2ac // sdot v12.4s, v21.16b, v23.4b[0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x4f96e2b0 // sdot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x4f98e289 // sdot v9.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28d // sdot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e291 // sdot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f98e2aa // sdot v10.4s, v21.16b, v24.4b[0]\n" + ".inst 0x4f97e2ae // sdot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x4f96e2b2 // sdot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x4f98e28b // sdot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x4f97e28f // sdot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x4f96e293 // sdot v19.4s, v20.16b, v22.4b[0]\n" "bge 88b\n" "89:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 92f\n" @@ -1086,23 +1085,23 @@ void a64_hybrid_s8s32_dot_6x16 ( "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "91:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q21, [x10, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x4f80e2a8 // sdot v8.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ac // sdot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b0 // sdot v16.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x4f80e289 // sdot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28d // sdot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e291 // sdot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e2aa // sdot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x4f81e2ae // sdot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x4f82e2b2 // sdot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x4f80e28b // sdot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4f81e28f // sdot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f82e293 // sdot v19.4s, v20.16b, v2.4b[0]\n" "92:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1367,14 +1366,14 @@ void a64_hybrid_s8s32_dot_6x16 ( "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 117f\n" - "ldr x21, [%x[input_ptr], x28, LSL 
#0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 118f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1384,9 +1383,9 @@ void a64_hybrid_s8s32_dot_6x16 ( "b 118f\n" "117:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "118:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 121f\n" @@ -1405,7 +1404,7 @@ void a64_hybrid_s8s32_dot_6x16 ( "add x26, x26, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x25, x25, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1413,85 +1412,85 @@ void a64_hybrid_s8s32_dot_6x16 ( "add x23, x23, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "cmp x27, #0x20\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 
0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, 
[x10, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 119b\n" @@ -1502,7 +1501,7 @@ void a64_hybrid_s8s32_dot_6x16 ( "add x25, x25, #0x10\n" ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x24, x24, #0x10\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -1510,112 +1509,112 @@ void a64_hybrid_s8s32_dot_6x16 ( "sub x27, x27, #0x10\n" ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 
0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x4fa0e328 // sdot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32c // sdot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e330 // sdot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e334 // sdot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x4fa0e309 // sdot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30d // sdot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e311 // sdot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e315 // sdot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x4fa0e32a // sdot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa1e32e // sdot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa2e332 // sdot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa3e336 // sdot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x4fa0e30b // sdot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x4fa1e30f // sdot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa2e313 // sdot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x4fa3e317 // sdot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x4f80eb28 // sdot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2c // sdot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb30 // sdot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb34 // sdot 
v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0d // sdot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb11 // sdot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb15 // sdot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x4f80eb2a // sdot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4f81eb2e // sdot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x4f82eb32 // sdot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x4f83eb36 // sdot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x4f80eb0b // sdot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x4f81eb0f // sdot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f82eb13 // sdot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x4f83eb17 // sdot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xd0]\n" + ".inst 0x4fa0eb28 // sdot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2c // sdot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb30 // sdot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb34 // sdot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x4fa0eb09 // sdot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0d // sdot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb11 // sdot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb15 // sdot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa0eb2a // sdot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa1eb2e // sdot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa2eb32 // sdot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa3eb36 // sdot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa0eb0b // sdot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x4fa1eb0f // sdot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x4fa2eb13 // sdot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x4fa3eb17 // sdot v23.4s, v24.16b, v3.4b[3]\n" "121:" // Height 4: Multiply loop: Main loop skip "cbz x27, 126f\n" "cmp x27, #0x4\n" "blt 123f\n" "122:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x4f9de328 // sdot v8.4s, v25.16b, v29.4b[0]\n" + ".inst 0x4f9ce32c // sdot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be330 // sdot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae334 // sdot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x4f9de309 // 
sdot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30d // sdot v13.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be311 // sdot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae315 // sdot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f9de32a // sdot v10.4s, v25.16b, v29.4b[0]\n" + ".inst 0x4f9ce32e // sdot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x4f9be332 // sdot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x4f9ae336 // sdot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x4f9de30b // sdot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x4f9ce30f // sdot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x4f9be313 // sdot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x4f9ae317 // sdot v23.4s, v24.16b, v26.4b[0]\n" "bge 122b\n" "123:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 126f\n" @@ -1636,27 +1635,27 @@ void a64_hybrid_s8s32_dot_6x16 ( "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" "125:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x4f80e328 // sdot v8.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32c // sdot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e330 // sdot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e334 // sdot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x4f80e309 // sdot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30d // sdot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e311 // sdot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e315 // sdot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e32a // sdot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4f81e32e // sdot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x4f82e332 // sdot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x4f83e336 // sdot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x4f80e30b // sdot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f81e30f // sdot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f82e313 // sdot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x4f83e317 // sdot v23.4s, v24.16b, v3.4b[0]\n" "126:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1960,162 +1959,162 @@ void a64_hybrid_s8s32_dot_6x16 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "149:" // Height 5: setup done - "mov x28, #0x0\n" - "150:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 151f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 152f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20\n" - "add x25, x25, x20\n" - "add x24, x24, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" - "b 152f\n" - "151:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "152:" // Height 5: input setup done - "cmp x27, #0x10\n" - "blt 155f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x20\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "blt 154f\n" - "153:" // Height 5: Multiply loop: Main loop head - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "sub x27, x27, #0x10\n" - "add x26, x26, #0x10\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x20\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, 
v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x28, #0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 152f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20\n" + "add x25, x25, x20\n" + "add x24, x24, x20\n" + "add x23, x23, x20\n" + "add x22, x22, x20\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "152:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 
155f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x27, x27, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x20\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 
0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 153b\n" @@ -2129,7 +2128,7 @@ void a64_hybrid_s8s32_dot_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q29, [x10, #0x20]\n" ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" @@ -2138,131 +2137,131 @@ void a64_hybrid_s8s32_dot_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q28, [x10, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e3aa // sdot 
v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x10, 
#0xe0]\n" - ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x4fa0e3a8 // sdot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ac // sdot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b0 // sdot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b4 // sdot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3b8 // sdot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x4fa0e389 // sdot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38d // sdot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e391 // sdot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e395 // sdot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x4fa0e3aa // sdot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4fa1e3ae // sdot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x4fa2e3b2 // sdot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x4fa3e3b6 // sdot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x4fa4e3ba // sdot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x4fa0e38b // sdot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x4fa1e38f // sdot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x4fa2e393 // sdot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x4fa3e397 // sdot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x4fa4e39b // sdot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x4f80eba8 // sdot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebac // sdot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb0 // sdot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb4 // sdot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebb8 // sdot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x4f80eb89 // sdot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8d // sdot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb91 // sdot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb95 // sdot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb99 // sdot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x4f80ebaa // sdot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4f81ebae // sdot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x4f82ebb2 // sdot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x4f83ebb6 // sdot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x4f84ebba // sdot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x4f80eb8b // sdot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x4f81eb8f // sdot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x4f82eb93 // sdot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x4f83eb97 // sdot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x4f84eb9b // sdot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x4fa0eba8 // sdot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebac // sdot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb0 // sdot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb4 // sdot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebb8 // sdot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x4fa0eb89 // sdot 
v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8d // sdot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb91 // sdot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb95 // sdot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb99 // sdot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa0ebaa // sdot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4fa1ebae // sdot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x4fa2ebb2 // sdot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x4fa3ebb6 // sdot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x4fa4ebba // sdot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x4fa0eb8b // sdot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x4fa1eb8f // sdot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x4fa2eb93 // sdot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x4fa3eb97 // sdot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x4fa4eb9b // sdot v27.4s, v28.16b, v4.4b[3]\n" "155:" // Height 5: Multiply loop: Main loop skip "cbz x27, 160f\n" "cmp x27, #0x4\n" "blt 157f\n" "156:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x10, #0x0]\n" + ".inst 0x4f82e3a8 // sdot v8.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x4f80e3b0 // sdot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b4 // sdot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3b8 // sdot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f82e389 // sdot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e391 // sdot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe395 // sdot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee399 // sdot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f82e3aa // sdot v10.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f80e3b2 // sdot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f9fe3b6 // sdot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4f9ee3ba // sdot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x4f82e38b // sdot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f80e393 // sdot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f9fe397 // sdot v23.4s, v28.16b, v31.4b[0]\n" + ".inst 0x4f9ee39b // sdot v27.4s, v28.16b, v30.4b[0]\n" "bge 156b\n" "157:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 160f\n" @@ -2286,31 +2285,31 @@ void a64_hybrid_s8s32_dot_6x16 ( "ldr b3, [x23, #0x0]\n" "ldr b4, [x22, #0x0]\n" "159:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q29, [x10, #0x0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x4f80e3a8 // sdot v8.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ac // sdot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b0 // sdot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b4 // sdot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3b8 // sdot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x4f80e389 // sdot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38d // sdot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e391 // sdot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e395 // sdot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e399 // sdot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f80e3aa // sdot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4f81e3ae // sdot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x4f82e3b2 // sdot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x4f83e3b6 // sdot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x4f84e3ba // sdot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x4f80e38b // sdot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x4f81e38f // sdot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x4f82e393 // sdot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x4f83e397 // sdot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x4f84e39b // sdot v27.4s, v28.16b, 
v4.4b[0]\n" "160:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2672,16 +2671,16 @@ void a64_hybrid_s8s32_dot_6x16 ( "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 185f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 186f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2693,11 +2692,11 @@ void a64_hybrid_s8s32_dot_6x16 ( "b 186f\n" "185:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "186:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 189f\n" @@ -2976,43 +2975,43 @@ void a64_hybrid_s8s32_dot_6x16 ( "cmp x27, #0x4\n" "blt 191f\n" "190:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x4f87e028 // sdot v8.4s, v1.16b, v7.4b[0]\n" + ".inst 0x4f86e02c // sdot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e030 // sdot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e034 // sdot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e038 // sdot v24.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03c // sdot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x4f87e009 // sdot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00d // sdot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e011 // sdot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e015 // sdot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e019 // sdot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01d // sdot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x10, #0x30]\n" "add x10, x10, 
#0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f87e02a // sdot v10.4s, v1.16b, v7.4b[0]\n" + ".inst 0x4f86e02e // sdot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4f85e032 // sdot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x4f84e036 // sdot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4f83e03a // sdot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4f82e03e // sdot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x4f87e00b // sdot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4f86e00f // sdot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x4f85e013 // sdot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x4f84e017 // sdot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x4f83e01b // sdot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x4f82e01f // sdot v31.4s, v0.16b, v2.4b[0]\n" "bge 190b\n" "191:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 194f\n" @@ -3039,35 +3038,35 @@ void a64_hybrid_s8s32_dot_6x16 ( "ldr b4, [x22, #0x0]\n" "ldr b5, [x21, #0x0]\n" "193:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ec // sdot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f0 // sdot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f4 // sdot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f8 // sdot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fc // sdot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cd // sdot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d1 // sdot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d5 // sdot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d9 // sdot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dd // sdot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x4f80e0eb // sdot v11.4s, 
v7.16b, v0.4b[0]\n" - ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ee // sdot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f2 // sdot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f6 // sdot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fa // sdot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fe // sdot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x4f80e0cb // sdot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0cf // sdot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d3 // sdot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d7 // sdot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0db // sdot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0df // sdot v31.4s, v6.16b, v5.4b[0]\n" "194:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3254,7 +3253,6 @@ void a64_hybrid_s8s32_dot_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "206:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp index 50ccb6fa3d..4905ba5656 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -92,7 +92,7 @@ public: case CPUModel::A510: return { 33.62, 3.92, 0.48 }; case CPUModel::V1: - return { 86.36, 19.25, 0.92 }; + return { 63.94, 16.18, 0.83 }; } } @@ -109,5 +109,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp index f48623e129..f8a76b5244 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp @@ -77,7 +77,6 @@ void a64_hybrid_s8s32_mmla_6x16 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 186f\n" @@ -178,11 +177,11 @@ void a64_hybrid_s8s32_mmla_6x16 ( "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -198,41 +197,41 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 19f\n" "18:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v19.2d, v1.2d, v20.2d\n" + ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, 
v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v20.2d\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "cmp x27, #0x20\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n" + ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n" "ldr q1, [x26, #0x0]\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" @@ -240,40 +239,40 @@ void a64_hybrid_s8s32_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "bge 18b\n" "19:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v20.2d, v1.2d, v21.2d\n" + ".inst 0x4e87a688 // smmla v8.4s, v20.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x4e86a68c // smmla v12.4s, v20.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e92a689 // smmla v9.4s, v20.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e91a68d // smmla v13.4s, v20.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a68a // smmla v10.4s, v20.16b, v18.16b\n" + "ldr q19, [x10, #0x60]\n" + ".inst 0x4e91a68e // smmla v14.4s, v20.16b, v17.16b\n" + "ldr q18, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v21.2d\n" + ".inst 0x4e93a68b // smmla v11.4s, v20.16b, v19.16b\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x4e92a68f // smmla v15.4s, v20.16b, v18.16b\n" + "ldr q19, [x10, #0x90]\n" + ".inst 0x4e91a428 // smmla v8.4s, v1.16b, v17.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x4e93a42c // smmla v12.4s, v1.16b, v19.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + 
".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n" + ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "20:" // Height 1: Multiply loop: Main loop skip @@ -281,26 +280,26 @@ void a64_hybrid_s8s32_mmla_6x16 ( "cmp x27, #0x8\n" "blt 22f\n" "21:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + "ldr d19, [x26], #0x8\n" + "ldr q18, [x10, #0x0]\n" + "trn1 v19.2d, v19.2d, v17.2d\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "bge 21b\n" "22:" // Height 1: Multiply loop: Skip odd blocks @@ -325,23 +324,23 @@ void a64_hybrid_s8s32_mmla_6x16 ( "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "26:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q23, [x10, #0x0]\n" + "ldr q18, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v17.2d\n" + ".inst 0x4e97a668 // smmla v8.4s, v19.16b, v23.16b\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x4e92a66c // smmla v12.4s, v19.16b, v18.16b\n" + "ldr q31, [x10, #0x30]\n" + ".inst 0x4e91a669 // smmla v9.4s, v19.16b, v17.16b\n" + "ldr q20, [x10, #0x40]\n" + ".inst 0x4e9fa66d // smmla v13.4s, v19.16b, v31.16b\n" + "ldr q17, 
[x10, #0x50]\n" + ".inst 0x4e94a66a // smmla v10.4s, v19.16b, v20.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "27:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -525,12 +524,12 @@ void a64_hybrid_s8s32_mmla_6x16 ( "52:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 53f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 54f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -538,7 +537,7 @@ void a64_hybrid_s8s32_mmla_6x16 ( "b 54f\n" "53:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "54:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 57f\n" @@ -549,85 +548,85 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 56f\n" "55:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n" + "ldr 
q17, [x10, #0xb0]\n" + ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" + ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x10, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "bge 55b\n" "56:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x4e87a668 // smmla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x4e86a66c // smmla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e92a669 // smmla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x4e92a428 // smmla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x4e91a42c // smmla v12.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x4e92a429 // smmla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x4e91a42d // smmla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x4e92a42a // smmla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x4e91a42e // smmla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e92a42b // smmla v11.4s, v1.16b, v18.16b\n" + 
".inst 0x4e91a42f // smmla v15.4s, v1.16b, v17.16b\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" @@ -637,27 +636,27 @@ void a64_hybrid_s8s32_mmla_6x16 ( "cmp x27, #0x8\n" "blt 59f\n" "58:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d18, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "trn1 v19.2d, v18.2d, v17.2d\n" "sub x27, x27, #0x8\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x40]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x60]\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q22, [x10, #0x10]\n" + ".inst 0x4e91a668 // smmla v8.4s, v19.16b, v17.16b\n" + ".inst 0x4e96a66c // smmla v12.4s, v19.16b, v22.16b\n" + "ldr q1, [x10, #0x20]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x4e81a669 // smmla v9.4s, v19.16b, v1.16b\n" + ".inst 0x4e91a66d // smmla v13.4s, v19.16b, v17.16b\n" + "ldr q18, [x10, #0x40]\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q18, [x10, #0x60]\n" + "ldr q17, [x10, #0x70]\n" "cmp x27, #0x8\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "bge 58b\n" "59:" // Height 2: Multiply loop: Skip odd blocks @@ -689,23 +688,23 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "63:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x4e92a668 // smmla v8.4s, v19.16b, v18.16b\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x4e91a66c // smmla v12.4s, v19.16b, v17.16b\n" + "ldr q21, [x10, #0x30]\n" + ".inst 0x4e85a669 // smmla v9.4s, v19.16b, v5.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x4e95a66d // smmla v13.4s, v19.16b, v21.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x4e92a66a // smmla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x4e91a66e // smmla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x4e92a66b // smmla v11.4s, v19.16b, v18.16b\n" + ".inst 0x4e91a66f // smmla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "64:" // Height 2: Multiply loop: No odd multiplies "ldr w20, 
[%x[args_ptr], %[offsetof_num_strings]]\n" @@ -953,13 +952,13 @@ void a64_hybrid_s8s32_mmla_6x16 ( "89:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 90f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 91f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -968,8 +967,8 @@ void a64_hybrid_s8s32_mmla_6x16 ( "b 91f\n" "90:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "91:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 94f\n" @@ -981,167 +980,167 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 93f\n" "92:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n" + ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x4e99a78f // 
smmla v15.4s, v28.16b, v25.16b\n" "cmp x27, #0x20\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 92b\n" "93:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n" + ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, 
#0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, 
v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n" + ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n" "94:" // Height 3: Multiply loop: Main loop skip "cbz x27, 101f\n" "cmp x27, #0x8\n" "blt 96f\n" "95:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr q26, [x10, #0x0]\n" + "trn1 v27.2d, v25.2d, v27.2d\n" + ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x8\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" "cmp x27, #0x8\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" "add x10, x10, #0x80\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" "bge 95b\n" "96:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 101f\n" @@ -1179,33 +1178,33 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "100:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla 
v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q29, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v25.2d\n" + ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e9da78c // smmla v12.4s, v28.16b, v29.16b\n" + ".inst 0x4e9da774 // smmla v20.4s, v27.16b, v29.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" "101:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1499,14 +1498,14 @@ void a64_hybrid_s8s32_mmla_6x16 ( "126:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 127f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 128f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1516,9 +1515,9 @@ void a64_hybrid_s8s32_mmla_6x16 ( "b 128f\n" "127:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "128:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 131f\n" @@ -1531,173 +1530,173 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 130f\n" "129:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 
v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n" "sub x27, x27, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n" + ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" "add x23, x23, #0x10\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n" "cmp x27, #0x20\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x25, 
#0x80]\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" + ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 129b\n" "130:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a788 // smmla v8.4s, v28.16b, v7.16b\n" "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x4e87a770 // smmla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e86a78c // smmla v12.4s, v28.16b, v6.16b\n" + ".inst 0x4e86a774 // smmla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" "add x24, x24, #0x10\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" "add x23, x23, #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x4e87a453 // smmla v19.4s, 
v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x4e9aa428 // smmla v8.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" + ".inst 0x4e9aa470 // smmla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x4e99a42c // smmla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" + ".inst 0x4e99a474 // smmla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x4e9aa429 // smmla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x4e9aa471 // smmla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x4e99a42d // smmla v13.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a475 // smmla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x4e9aa42a // smmla v10.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa472 // smmla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x4e99a42e // smmla v14.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a476 // smmla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e9aa42b // smmla v11.4s, v1.16b, v26.16b\n" + ".inst 0x4e9aa473 // smmla v19.4s, v3.16b, v26.16b\n" + ".inst 0x4e99a42f // smmla v15.4s, v1.16b, v25.16b\n" + ".inst 0x4e99a477 // smmla v23.4s, v3.16b, v25.16b\n" "131:" // Height 4: Multiply loop: Main loop skip "cbz x27, 138f\n" "cmp x27, #0x8\n" "blt 133f\n" "132:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "trn1 v27.2d, v26.2d, v25.2d\n" "cmp x27, #0x8\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla 
v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" "bge 132b\n" "133:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 138f\n" @@ -1742,33 +1741,33 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" "137:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x4e9aa788 // smmla v8.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa770 // smmla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x4e99a78c // smmla v12.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a774 // smmla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x4e9aa789 // smmla v9.4s, v28.16b, v26.16b\n" + ".inst 
0x4e9aa771 // smmla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x4e99a78d // smmla v13.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a775 // smmla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x4e9aa78a // smmla v10.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa772 // smmla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4e99a78e // smmla v14.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a776 // smmla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e9aa78b // smmla v11.4s, v28.16b, v26.16b\n" + ".inst 0x4e9aa773 // smmla v19.4s, v27.16b, v26.16b\n" + ".inst 0x4e99a78f // smmla v15.4s, v28.16b, v25.16b\n" + ".inst 0x4e99a777 // smmla v23.4s, v27.16b, v25.16b\n" "138:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2125,15 +2124,15 @@ void a64_hybrid_s8s32_mmla_6x16 ( "163:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 164f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 165f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2144,10 +2143,10 @@ void a64_hybrid_s8s32_mmla_6x16 ( "b 165f\n" "164:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "165:" // Height 5: input setup done "cmp x27, #0x10\n" "blt 168f\n" @@ -2160,174 +2159,174 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q7, [x10, #0x0]\n" "blt 167f\n" "166:" // Height 5: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" "sub x27, x27, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n" "add x26, x26, #0x10\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + 
".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n" "add x25, x25, #0x10\n" ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x40]\n" "add x24, x24, #0x10\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n" "cmp x27, #0x20\n" ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x4e86a429 // smmla v9.4s, v1.16b, 
v6.16b\n" + ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4e80a42d // smmla v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xd0]\n" + ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "bge 166b\n" "167:" // Height 5: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a4c8 // smmla v8.4s, v6.16b, v7.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" "add x26, x26, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4cc // smmla v12.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a454 // smmla v20.4s, v2.16b, v0.16b\n" "add x25, x25, #0x10\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49c // smmla v28.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x4e87a4c9 // smmla v9.4s, v6.16b, v7.16b\n" "add x24, x24, #0x10\n" ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x40]\n" "add x23, x23, #0x10\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4cd // smmla v13.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a455 // smmla v21.4s, v2.16b, v0.16b\n" "add x22, x22, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x4e87a4ca // smmla v10.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" "ldr q7, [x10, 
#0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a4ce // smmla v14.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a456 // smmla v22.4s, v2.16b, v0.16b\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" + ".inst 0x4e80a49e // smmla v30.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x4e87a4cb // smmla v11.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x80]\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x4e80a4cf // smmla v15.4s, v6.16b, v0.16b\n" + ".inst 0x4e80a457 // smmla v23.4s, v2.16b, v0.16b\n" + ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xe0]\n" + ".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n" + 
".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n" ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" @@ -2337,48 +2336,48 @@ void a64_hybrid_s8s32_mmla_6x16 ( "blt 170f\n" "169:" // Height 5: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + "ldr d0, [x22], #0x8\n" + "ldr q1, [x10, #0x0]\n" + "trn1 v2.2d, v0.2d, v2.2d\n" + ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" "cmp x27, #0x8\n" - ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" + ".inst 0x4e80a48e // smmla v14.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n" "add x10, x10, #0x80\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49f // smmla 
v31.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" "bge 169b\n" "170:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 175f\n" @@ -2430,42 +2429,42 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr b4, [x23, #0x0]\n" "ldr b5, [x22, #0x0]\n" "174:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v5.2d, v0.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x4e86a4e8 // smmla v8.4s, v7.16b, v6.16b\n" + ".inst 0x4e86a470 // smmla v16.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n" + ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n" + ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a476 // smmla v22.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n" - ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" + ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n" + ".inst 0x4e80a473 // smmla 
v19.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n" + ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n" + ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n" "175:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2872,16 +2871,16 @@ void a64_hybrid_s8s32_mmla_6x16 ( "200:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 201f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 202f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2893,11 +2892,11 @@ void a64_hybrid_s8s32_mmla_6x16 ( "b 202f\n" "201:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "202:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 205f\n" @@ -2964,42 +2963,42 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr q2, [x25, #0x0]\n" "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x4e80a42c // smmla v12.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bc // smmla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x4e86a429 // smmla v9.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4b9 // smmla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x4e80a42d // 
smmla v13.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bd // smmla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xd0]\n" + ".inst 0x4e86a42a // smmla v10.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a472 // smmla v18.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4ba // smmla v26.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x4e80a42e // smmla v14.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4be // smmla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e86a42b // smmla v11.4s, v1.16b, v6.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a4bb // smmla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e80a42f // smmla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" + ".inst 0x4e80a4bf // smmla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "bge 203b\n" @@ -3055,35 +3054,35 @@ void a64_hybrid_s8s32_mmla_6x16 ( ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n" ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n" ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n" ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n" ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n" - ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x4e82a42c // smmla v12.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a474 // smmla v20.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bc // smmla v28.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x4e80a429 // smmla v9.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x4e82a42d // smmla v13.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a475 // smmla v21.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4bd // smmla v29.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x4e80a42a // smmla v10.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4ba // smmla v26.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xe0]\n" + 
".inst 0x4e82a42e // smmla v14.4s, v1.16b, v2.16b\n" + ".inst 0x4e82a476 // smmla v22.4s, v3.16b, v2.16b\n" + ".inst 0x4e82a4be // smmla v30.4s, v5.16b, v2.16b\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n" - ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n" - ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n" + ".inst 0x4e80a42b // smmla v11.4s, v1.16b, v0.16b\n" + ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a4bb // smmla v27.4s, v5.16b, v0.16b\n" ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n" ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n" @@ -3093,49 +3092,49 @@ void a64_hybrid_s8s32_mmla_6x16 ( "blt 207f\n" "206:" // Height 6: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "cmp x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n" + "ldr d1, [x22], #0x8\n" + "ldr d0, [x21], #0x8\n" + "trn1 v2.2d, v1.2d, v0.2d\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x4e81a488 // smmla v8.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a470 // smmla v16.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a458 // smmla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x4e80a48c // smmla v12.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a474 // smmla v20.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45c // smmla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x4e81a489 // smmla v9.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a471 // smmla v17.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a459 // smmla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x4e80a48d // smmla v13.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a475 // smmla v21.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45d // smmla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x4e81a48a // smmla v10.4s, v4.16b, v1.16b\n" + ".inst 0x4e81a472 // smmla v18.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45a // smmla v26.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x4e80a48e // smmla 
v14.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a476 // smmla v22.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45e // smmla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n" - ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n" + ".inst 0x4e86a48b // smmla v11.4s, v4.16b, v6.16b\n" + ".inst 0x4e86a473 // smmla v19.4s, v3.16b, v6.16b\n" + ".inst 0x4e86a45b // smmla v27.4s, v2.16b, v6.16b\n" + ".inst 0x4e80a48f // smmla v15.4s, v4.16b, v0.16b\n" + ".inst 0x4e80a477 // smmla v23.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45f // smmla v31.4s, v2.16b, v0.16b\n" "bge 206b\n" "207:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 212f\n" @@ -3194,42 +3193,42 @@ void a64_hybrid_s8s32_mmla_6x16 ( "ldr b5, [x22, #0x0]\n" "ldr b6, [x21, #0x0]\n" "211:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" - ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n" - ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n" + "ldr q0, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + ".inst 0x4e80a4e8 // smmla v8.4s, v7.16b, v0.16b\n" + "trn1 v2.2d, v5.2d, v6.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x4e80a470 // smmla v16.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a458 // smmla v24.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x4e81a4ec // smmla v12.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a474 // smmla v20.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45c // smmla v28.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x4e80a4e9 // smmla v9.4s, v7.16b, v0.16b\n" + ".inst 0x4e80a471 // smmla v17.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a459 // smmla v25.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x4e81a4ed // smmla v13.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a475 // smmla v21.4s, v3.16b, v1.16b\n" + ".inst 0x4e81a45d // smmla v29.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x4e80a4ea // smmla v10.4s, v7.16b, v0.16b\n" + ".inst 0x4e80a472 // smmla v18.4s, v3.16b, v0.16b\n" + ".inst 0x4e80a45a // smmla v26.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x4e81a4ee // smmla v14.4s, v7.16b, v1.16b\n" + ".inst 0x4e81a476 // smmla 
v22.4s, v3.16b, v1.16b\n"
+ ".inst 0x4e81a45e // smmla v30.4s, v2.16b, v1.16b\n"
 "ldr q6, [x10, #0x70]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n"
 "add x10, x10, #0x80\n"
- ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
- ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e80a473 // smmla v19.4s, v3.16b, v0.16b\n"
+ ".inst 0x4e80a45b // smmla v27.4s, v2.16b, v0.16b\n"
+ ".inst 0x4e86a4ef // smmla v15.4s, v7.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a45f // smmla v31.4s, v2.16b, v6.16b\n"
 "212:" // Height 6: Multiply loop: No odd multiplies
 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
 "add x28, x28, #0x1\n"
@@ -3440,7 +3439,6 @@ void a64_hybrid_s8s32_mmla_6x16 (
 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
 "b 1b\n"
 "224:" // Exit
-
 : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
 : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index ebc43425b8..14aba00788 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -10,20 +10,20 @@
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
 */
 #pragma once
-
 #ifdef __aarch64__
+
 #include "../std_transforms_fixed.hpp"
 #include "../performance_parameters.hpp"
@@ -84,7 +84,7 @@ public:
 case CPUModel::A510:
 return { 14.81 };
 case CPUModel::V1:
- return { 48.36 };
+ return { 44.54 };
 }
 }
@@ -108,5 +108,4 @@ public:
 } // namespace arm_gemm
 #undef ARGLIST
-
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index b9caf545f1..00d063b426 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -78,329 +78,328 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
 flags |= 0x20;
 }
 __asm__ __volatile__(
-
 "1:" // Row loop
 "cmp %x[M], #0x4\n"
 "bge 91f\n"
 "cmp %x[M], #0x2\n"
 "bgt 61f\n"
 "beq 31f\n"
- "mov x16, %x[col_bias]\n"
+ "mov x15, %x[col_bias]\n"
 "movi v11.4s, #0x0\n"
 "movi v15.16b, #0x1\n"
 "bic %x[flags], %x[flags], #0x80000000\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[output_ptr]\n"
- "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
 "2:" // Height 1: Column loop
 "movi v16.4s, #0x0\n"
 "movi v17.4s, #0x0\n"
 "movi v18.4s, #0x0\n"
 "movi v19.4s, #0x0\n"
 "3:" // Height 1: setup done
- "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
 "4:" // Height 1: String loop
 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
 "tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x10, [x21, #0x0]\n"
- "cbnz x12, 6f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x20\n"
+ "add x9, x9, x20\n"
 "b 6f\n"
 "5:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
+ "mov x9, %x[input_ptr]\n"
 "6:" // Height 1: input setup done
- "cmp x11, #0x10\n"
+ "cmp x10, #0x10\n"
 "blt 11f\n"
- "ldr q0, [x10, #0x0]\n"
- "cmp x11, #0x20\n"
- "ldr q4, [x13, #0x0]\n"
- "ldr q5, [x13, #0x10]\n"
- "ldr q6, [x13, #0x20]\n"
- "ldr q7, [x13, #0x30]\n"
- "ldr q8, [x13, #0x40]\n"
- "ldr q9, [x13, #0x50]\n"
- "ldr q10, [x13, #0x60]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q4, [x12, #0x0]\n"
+ "ldr q5, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q8, [x12, #0x40]\n"
+ "ldr q9, [x12, #0x50]\n"
+ "ldr q10, [x12, #0x60]\n"
 "blt 9f\n"
 "7:" // Height 1: Multiply loop: Main loop head
 ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr d4, [x13, #0x70]\n"
- "ldr x9, [x13, #0x78]\n"
+ "ldr d21, [x12, #0x70]\n"
+ "ldr x20, [x12, #0x78]\n"
 ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr d5, [x13, #0x80]\n"
+ "ldr d20, [x12, #0x80]\n"
 ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr d6, [x13, #0x90]\n"
+ "ldr d26, [x12, #0x90]\n"
 ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr d7, [x13, #0xa0]\n"
- "mov v4.d[1], x9\n"
- "ldr x28, [x13, #0x88]\n"
+ "ldr d25, [x12, #0xa0]\n"
+ "mov v21.d[1], x20\n"
+ "ldr x20, [x12, #0x88]\n"
 ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr d8, [x13, #0xb0]\n"
+ "ldr d24, [x12, #0xb0]\n"
 ".inst 0x6fa0e131 //
udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d23, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - "mov v5.d[1], x28\n" - "ldr x27, [x13, #0x98]\n" - "mov v6.d[1], x27\n" - "ldr x26, [x13, #0xa8]\n" - "mov v7.d[1], x26\n" - "ldr x25, [x13, #0xb8]\n" - "mov v8.d[1], x25\n" - "ldr x24, [x13, #0xc8]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "ldr x20, [x13, #0xd8]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - "ldr x9, [x13, #0xe8]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - "ldr x28, [x13, #0xf8]\n" - "mov v9.d[1], x24\n" - "mov v10.d[1], x20\n" - "add x10, x10, #0x10\n" - "mov v4.d[1], x9\n" - "add x13, x13, #0x100\n" - "mov v5.d[1], x28\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "ldr d22, [x12, #0xd0]\n" + ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" + "ldr d21, [x12, #0xe0]\n" + "mov v20.d[1], x20\n" + "ldr x20, [x12, #0x98]\n" + "mov v26.d[1], x20\n" + "ldr x20, [x12, #0xa8]\n" + "mov v25.d[1], x20\n" + "ldr x20, [x12, #0xb8]\n" + "mov v24.d[1], x20\n" + "ldr x23, [x12, #0xc8]\n" + ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" + "ldr d20, [x12, #0xf0]\n" + ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" + "ldr x22, [x12, #0xd8]\n" + ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" + "ldr x21, [x12, #0xe8]\n" + ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" + "ldr x20, [x12, #0xf8]\n" + "mov v23.d[1], x23\n" + "mov v22.d[1], x22\n" + "add x9, x9, #0x10\n" + "mov v21.d[1], x21\n" + "add x12, x12, #0x100\n" + "mov v20.d[1], x20\n" + ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q4, [x13, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q4, [x12, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q21, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q20, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q26, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q25, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q24, [x12, #0xb0]\n" ".inst 
0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q23, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "sub x11, x11, #0x10\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - "add x10, x10, #0x10\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "ldr q22, [x12, #0xd0]\n" + ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x12, #0xe0]\n" + ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x12, #0xf0]\n" + ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" + "sub x10, x10, #0x10\n" + ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" + "add x9, x9, #0x10\n" + ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" "11:" // Height 1: Multiply loop: Main loop skip - "cbz x11, 18f\n" - "cmp x11, #0x4\n" + "cbz x10, 18f\n" + "cmp x10, #0x4\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" + "ldr s0, [x9], #0x4\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q22, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q21, [x12, #0x20]\n" + ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x30]\n" + ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" + ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks - "cbz x11, 18f\n" - "tbz x11, #1, 15f\n" - "ldr h0, [x10], #0x2\n" - "tbz x11, #0, 16f\n" - "ld1 { v0.b }[2], [x10]\n" + "cbz x10, 18f\n" + "tbz x10, #1, 15f\n" + "ldr h0, [x9], #0x2\n" + "tbz x10, #0, 16f\n" + "ld1 { v0.b }[2], [x9]\n" "b 16f\n" "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" + "ldr b0, [x9, #0x0]\n" "16:" // Height 1: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x13, #0x0]\n" - 
".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" + "ldr q20, [x12, #0x0]\n" + ".inst 0x6f80e290 // udot v16.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x10]\n" + ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x20]\n" + ".inst 0x6f80e292 // udot v18.4s, v20.16b, v0.4b[0]\n" + "ldr q20, [x12, #0x30]\n" + ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 4b\n" - "prfm pstl1keep, [x14, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" "tbnz %x[flags], #31, 19f\n" "addp v11.4s, v11.4s, v11.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" - "neg v1.4s, v1.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v20.4s }, [x20]\n" + "neg v20.4s, v20.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v20.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q23, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q22, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q21, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q20, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v23.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v20.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 20f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v0.16b\n" + "and v21.16b, v18.16b, v0.16b\n" + "and v20.16b, v19.16b, v0.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v20.4s\n" "20:" // Height 1: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, 
v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v20.4s }, [x20]\n" + "add v16.4s, v16.4s, v20.4s\n" + "add v17.4s, v17.4s, v20.4s\n" + "add v18.4s, v18.4s, v20.4s\n" + "add v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v20.4s }, [x20]\n" + "smin v16.4s, v16.4s, v20.4s\n" + "smin v17.4s, v17.4s, v20.4s\n" + "smin v18.4s, v18.4s, v20.4s\n" + "smin v19.4s, v19.4s, v20.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" - "cmp x15, #0x10\n" + "cmp x14, #0x10\n" "uzp1 v16.16b, v16.16b, v17.16b\n" "bge 29f\n" - "tbz x15, #3, 24f\n" - "str d16, [x14], #0x8\n" - "tbz x15, #2, 22f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "tbz x15, #1, 21f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[14], [x14]\n" + "tbz x14, #3, 24f\n" + "str d16, [x13], #0x8\n" + "tbz x14, #2, 22f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "tbz x14, #1, 21f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[14], [x13]\n" "b 28f\n" "21:" // Height 1: Partial direct writeback: partial_1_12 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[12], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[12], [x13]\n" "b 28f\n" "22:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x15, #1, 23f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[10], [x14]\n" + "tbz x14, #1, 23f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[10], [x13]\n" "b 28f\n" "23:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[8], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[8], [x13]\n" "b 28f\n" "24:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x15, #2, 26f\n" - "str s16, [x14], #0x4\n" - "tbz x15, #1, 25f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[6], [x14]\n" + "tbz x14, #2, 26f\n" + "str s16, [x13], #0x4\n" + "tbz x14, #1, 25f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[6], [x13]\n" "b 28f\n" "25:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x15, #0, 28f\n" - "st1 { v16.b }[4], [x14]\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[4], [x13]\n" "b 28f\n" "26:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x15, #1, 27f\n" - "str h16, [x14], #0x2\n" - "tbz x15, #0, 28f\n" - "st1 { v16.b }[2], [x14]\n" + "tbz x14, #1, 27f\n" + "str h16, [x13], #0x2\n" + "tbz x14, #0, 28f\n" + "st1 { v16.b }[2], [x13]\n" "b 28f\n" "27:" // Height 1: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" + "str b16, [x13, #0x0]\n" "28:" // Height 1: Partial direct writeback: Done "b 30f\n" "29:" // Height 1: Full writeback - "str q16, [x14, 
#0x0]\n" - "add x14, x14, #0x10\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" "30:" // Height 1: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 2b\n" "b 122f\n" "31:" // Height 2 - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v15.16b, #0x1\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" "32:" // Height 2: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" @@ -411,307 +410,307 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "33:" // Height 2: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 35f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "cbnz x12, 36f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x11, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" "b 36f\n" "35:" // Height 2: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" "36:" // Height 2: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 41f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 39f\n" "37:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x20, [x12, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d25, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v25.d[1], x20\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d24, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x23, [x12, #0x88]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d30, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x22, [x12, #0x98]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr d29, [x12, #0xa0]\n" + "ldr x21, [x12, #0xa8]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr d28, 
[x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d27, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "mov v5.d[1], x28\n" + "mov v24.d[1], x23\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "mov v6.d[1], x27\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - "mov v7.d[1], x26\n" - "ldr x24, [x13, #0xc8]\n" - "mov v8.d[1], x25\n" - "ldr x20, [x13, #0xd8]\n" - "ldr x9, [x13, #0xe8]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - "ldr x28, [x13, #0xf8]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - "mov v9.d[1], x24\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - "mov v10.d[1], x20\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - "mov v4.d[1], x9\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - "add x10, x10, #0x10\n" - "add x23, x23, #0x10\n" - "add x13, x13, #0x100\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "ldr d26, [x12, #0xd0]\n" + ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" + "mov v30.d[1], x22\n" + ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" + "ldr d25, [x12, #0xe0]\n" + "mov v29.d[1], x21\n" + "ldr x23, [x12, #0xc8]\n" + "mov v28.d[1], x20\n" + "ldr x22, [x12, #0xd8]\n" + "ldr x21, [x12, #0xe8]\n" + ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" + "ldr d24, [x12, #0xf0]\n" + "ldr x20, [x12, #0xf8]\n" + ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" + "mov v27.d[1], x23\n" + ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" + "mov v26.d[1], x22\n" + ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" + "mov v24.d[1], x20\n" + ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" + "add x9, x9, #0x10\n" + "add x28, x28, #0x10\n" + "add x12, x12, #0x100\n" + ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 38f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "38:" // Height 2: Multiply loop: unique 5: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, #0x0]\n" - "cmp 
x11, #0x20\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "bge 37b\n" "39:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q25, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q24, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q30, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q29, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q28, [x12, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q27, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "ldr q26, [x12, #0xd0]\n" + ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x12, #0xe0]\n" + ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x12, #0xf0]\n" + ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x6f80ebb2 // udot v18.4s, 
v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "40:" // Height 2: Multiply loop: unique 6: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" "41:" // Height 2: Multiply loop: Main loop skip - "cbz x11, 48f\n" - "cmp x11, #0x4\n" + "cbz x10, 48f\n" + "cmp x10, #0x4\n" "blt 44f\n" "42:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" "tbnz %x[flags], #31, 43f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q27, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q26, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q25, [x12, #0x20]\n" + ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" + "ldr q24, [x12, #0x30]\n" + ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" + ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" "bge 42b\n" "44:" // Height 2: Multiply loop: Skip odd blocks - "cbz x11, 48f\n" - "tbz x11, #1, 45f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "tbz x11, #0, 46f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" + "cbz x10, 48f\n" + "tbz x10, #1, 45f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x10, #0, 46f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" "b 46f\n" "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" "46:" // Height 2: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 47f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "47:" // Height 2: Multiply loop: unique 8: 
skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + "ldr q24, [x12, #0x0]\n" + ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n" + "ldr q26, [x12, #0x10]\n" + ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n" + "ldr q25, [x12, #0x20]\n" + ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" + "ldr q24, [x12, #0x30]\n" + ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" "48:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 34b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "add x23, x13, x20\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "tbnz %x[flags], #31, 49f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" - "neg v2.4s, v2.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" + "neg v24.4s, v24.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + "mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "49:" // Height 2: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q27, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q26, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q25, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q24, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v27.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v25.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v27.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, 
v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 50f\n" - "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" + "and v24.16b, v16.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v24.4s\n" + "and v30.16b, v17.16b, v0.16b\n" + "and v29.16b, v18.16b, v0.16b\n" + "and v28.16b, v19.16b, v0.16b\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v0.16b\n" + "and v25.16b, v22.16b, v0.16b\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "50:" // Height 2: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -721,122 +720,122 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v24.4s }, [x20]\n" + "add v16.4s, v16.4s, v24.4s\n" + "add v17.4s, v17.4s, 
v24.4s\n" + "add v18.4s, v18.4s, v24.4s\n" + "add v19.4s, v19.4s, v24.4s\n" + "add v20.4s, v20.4s, v24.4s\n" + "add v21.4s, v21.4s, v24.4s\n" + "add v22.4s, v22.4s, v24.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v24.4s }, [x20]\n" + "smin v16.4s, v16.4s, v24.4s\n" + "smin v17.4s, v17.4s, v24.4s\n" + "smin v18.4s, v18.4s, v24.4s\n" + "smin v19.4s, v19.4s, v24.4s\n" + "smin v20.4s, v20.4s, v24.4s\n" + "smin v21.4s, v21.4s, v24.4s\n" + "smin v22.4s, v22.4s, v24.4s\n" + "smin v23.4s, v23.4s, v24.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 59f\n" - "tbz x15, #3, 54f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "tbz x15, #2, 52f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "tbz x15, #1, 51f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" + "tbz x14, #3, 54f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "tbz x14, #2, 52f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "tbz x14, #1, 51f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" "b 58f\n" "51:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x15, #0, 58f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" "b 58f\n" "52:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x15, #1, 53f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" + "tbz x14, #1, 53f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" "b 58f\n" "53:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x15, #0, 58f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" "b 58f\n" "54:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x15, #2, 56f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "tbz x15, #1, 55f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" + "tbz x14, #2, 56f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "tbz x14, #1, 55f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" "b 58f\n" "55:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x15, #0, 58f\n" - "st1 
{ v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" "b 58f\n" "56:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x15, #1, 57f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "tbz x15, #0, 58f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" + "tbz x14, #1, 57f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "tbz x14, #0, 58f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" "b 58f\n" "57:" // Height 2: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b16, [x13, #0x0]\n" + "str b20, [x23, #0x0]\n" "58:" // Height 2: Partial direct writeback: Done "b 60f\n" "59:" // Height 2: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" "60:" // Height 2: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 32b\n" "b 122f\n" "61:" // Height 3 - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" "movi v15.16b, #0x1\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "mov x14, %x[output_ptr]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x13, %x[output_ptr]\n" "62:" // Height 3: Column loop "movi v16.4s, #0x0\n" "movi v17.4s, #0x0\n" @@ -851,317 +850,317 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v26.4s, #0x0\n" "movi v27.4s, #0x0\n" "63:" // Height 3: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 65f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "cbnz x12, 66f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "cbnz x11, 66f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" + "add x27, x27, x20\n" "b 66f\n" "65:" // Height 3: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" - "add x22, x23, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" + "add x27, x28, x21\n" "66:" // Height 3: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 71f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x27, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 69f\n" "67:" // Height 3: Multiply loop: Main loop head 
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x20, [x12, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x23, [x12, #0x88]\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d29, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v29.d[1], x20\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x22, [x12, #0x98]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d28, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr x21, [x12, #0xa8]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr x20, [x12, #0xb8]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d5, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "mov v5.d[1], x28\n" + "mov v28.d[1], x23\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "mov v6.d[1], x27\n" + "mov v5.d[1], x22\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" + "ldr d4, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "mov v7.d[1], x26\n" + "mov v4.d[1], x21\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr x24, [x13, #0xc8]\n" + "ldr x23, [x12, #0xc8]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" + "ldr d3, [x12, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "mov v8.d[1], x25\n" + "mov v3.d[1], x20\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr x20, [x13, #0xd8]\n" + "ldr x22, [x12, #0xd8]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d31, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr x9, [x13, #0xe8]\n" + "ldr x21, [x12, #0xe8]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr x28, [x13, #0xf8]\n" + "ldr x20, [x12, #0xf8]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "mov v9.d[1], x24\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "mov v10.d[1], x20\n" - ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x9\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "add x10, x10, #0x10\n" - ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - "add x23, x23, #0x10\n" - ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" - "add x22, x22, #0x10\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 
0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "ldr d30, [x12, #0xd0]\n" + ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" + "mov v31.d[1], x23\n" + ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" + "mov v30.d[1], x22\n" + ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" + "ldr d29, [x12, #0xe0]\n" + ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" + "add x9, x9, #0x10\n" + ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" + "ldr d28, [x12, #0xf0]\n" + ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" + "mov v28.d[1], x20\n" + ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" + "add x27, x27, #0x10\n" + ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 68f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "68:" // Height 3: Multiply loop: unique 9: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q2, [x22, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x27, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" "bge 67b\n" "69:" // Height 3: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, 
v1.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q29, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q28, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q5, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q4, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q3, [x12, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q31, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "ldr q30, [x12, #0xd0]\n" + ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x12, #0xe0]\n" + ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" + "ldr 
q28, [x12, #0xf0]\n" + ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" + "add x12, x12, #0x100\n" + ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 70f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "70:" // Height 3: Multiply loop: unique 10: skip row sum - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" "71:" // Height 3: Multiply loop: Main loop skip - "cbz x11, 78f\n" - "cmp x11, #0x4\n" + "cbz x10, 78f\n" + "cmp x10, #0x4\n" "blt 74f\n" "72:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" - "ldr s2, [x22], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x27], #0x4\n" "tbnz %x[flags], #31, 73f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q31, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q30, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q29, [x12, #0x20]\n" + ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" + "ldr q28, [x12, #0x30]\n" + ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" + ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 
0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" "bge 72b\n" "74:" // Height 3: Multiply loop: Skip odd blocks - "cbz x11, 78f\n" - "tbz x11, #1, 75f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "ldr h2, [x22], #0x2\n" - "tbz x11, #0, 76f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" - "ld1 { v2.b }[2], [x22]\n" + "cbz x10, 78f\n" + "tbz x10, #1, 75f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "tbz x10, #0, 76f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x27]\n" "b 76f\n" "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" - "ldr b2, [x22, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x27, #0x0]\n" "76:" // Height 3: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 77f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + "ldr q28, [x12, #0x0]\n" + ".inst 0x6f80e390 // udot v16.4s, v28.16b, v0.4b[0]\n" + "ldr q30, [x12, #0x10]\n" + ".inst 0x6f81e394 // udot v20.4s, v28.16b, v1.4b[0]\n" + "ldr q29, [x12, #0x20]\n" + ".inst 0x6f82e398 // udot v24.4s, v28.16b, v2.4b[0]\n" + "ldr q28, [x12, #0x30]\n" + ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" "78:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" - "add x21, x22, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" + "add x23, x13, x20\n" + "add x22, x23, x20\n" + "prfm pstl1keep, [x13, 
#0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbnz %x[flags], #31, 79f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" - "neg v3.4s, v3.4s\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v28.4s }, [x20]\n" + "neg v28.4s, v28.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v28.4s\n" + "mul v12.4s, v12.4s, v28.4s\n" + "mul v13.4s, v13.4s, v28.4s\n" "79:" // Height 3: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q31, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q30, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q29, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q28, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1171,73 +1170,73 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "add v25.4s, v25.4s, v13.4s\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v31.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v31.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v31.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v28.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v28.4s\n" + "sqrdmulh v17.4s, v17.4s, v28.4s\n" + "sqrdmulh v18.4s, v18.4s, v28.4s\n" + "sqrdmulh v19.4s, v19.4s, v28.4s\n" + "sqrdmulh v20.4s, v20.4s, v28.4s\n" + "sqrdmulh v21.4s, v21.4s, v28.4s\n" + "sqrdmulh v22.4s, v22.4s, v28.4s\n" + "sqrdmulh v23.4s, v23.4s, v28.4s\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 80f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" 
- "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v16.16b, v0.16b\n" + "and v31.16b, v17.16b, v0.16b\n" + "and v30.16b, v18.16b, v0.16b\n" + "and v29.16b, v19.16b, v0.16b\n" + "and v28.16b, v20.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v30.4s\n" + "sqadd v19.4s, v19.4s, v29.4s\n" + "sqadd v20.4s, v20.4s, v28.4s\n" + "and v3.16b, v21.16b, v0.16b\n" + "and v2.16b, v22.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v0.16b\n" + "and v29.16b, v26.16b, v0.16b\n" + "and v28.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v3.4s\n" + "sqadd v22.4s, v22.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "80:" // Height 3: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -1251,156 +1250,156 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, 
v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v28.4s }, [x20]\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "add v18.4s, v18.4s, v28.4s\n" + "add v19.4s, v19.4s, v28.4s\n" + "add v20.4s, v20.4s, v28.4s\n" + "add v21.4s, v21.4s, v28.4s\n" + "add v22.4s, v22.4s, v28.4s\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v24.4s, v24.4s, v28.4s\n" + "add v25.4s, v25.4s, v28.4s\n" + "add v26.4s, v26.4s, v28.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v23.4s, v23.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 89f\n" - "tbz x15, #3, 84f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "tbz x15, #2, 82f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "tbz x15, #1, 81f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "tbz x14, #3, 84f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x14, #2, 82f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "tbz x14, #1, 81f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 88f\n" "81:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" 
"b 88f\n" "82:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x15, #1, 83f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "tbz x14, #1, 83f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 88f\n" "83:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 88f\n" "84:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x15, #2, 86f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "tbz x15, #1, 85f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "tbz x14, #2, 86f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "tbz x14, #1, 85f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 88f\n" "85:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x15, #0, 88f\n" - "st1 { v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 88f\n" "86:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x15, #1, 87f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "tbz x15, #0, 88f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "tbz x14, #1, 87f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "tbz x14, #0, 88f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 88f\n" "87:" // Height 3: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b16, [x13, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "88:" // Height 3: Partial direct writeback: Done "b 90f\n" "89:" // Height 3: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "90:" // Height 3: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 62b\n" "b 122f\n" "91:" // Height 4 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" "mov x20, #0x4\n" - "mov x16, %x[col_bias]\n" + "mov x15, %x[col_bias]\n" "movi v11.4s, #0x0\n" "movi v12.4s, #0x0\n" "bic %x[flags], %x[flags], #0x80000000\n" "movi v13.4s, #0x0\n" - "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" "movi v14.4s, #0x0\n" - "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" "movi v15.16b, #0x1\n" - "mov 
x14, %x[output_ptr]\n" + "mov x13, %x[output_ptr]\n" "madd %x[output_ptr], x21, x20, %x[output_ptr]\n" "92:" // Height 4: Column loop "movi v16.4s, #0x0\n" @@ -1420,117 +1419,117 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "movi v30.4s, #0x0\n" "movi v31.4s, #0x0\n" "93:" // Height 4: setup done - "mov x12, #0x0\n" + "mov x11, #0x0\n" "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w11, [x20, x12, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w10, [x20, x11, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 95f\n" - "ldr x21, [%x[input_ptr], x12, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x10, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" - "cbnz x12, 96f\n" + "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x9, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x27, [x20, #0x10]\n" + "ldr x26, [x20, #0x18]\n" + "cbnz x11, 96f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x10, x10, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" - "add x21, x21, x20\n" + "add x9, x9, x20\n" + "add x28, x28, x20\n" + "add x27, x27, x20\n" + "add x26, x26, x20\n" "b 96f\n" "95:" // Height 4: setup direct input - "mov x10, %x[input_ptr]\n" - "add x23, x10, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "mov x9, %x[input_ptr]\n" + "add x28, x9, x21\n" + "add x27, x28, x21\n" + "add x26, x27, x21\n" "96:" // Height 4: input setup done - "cmp x11, #0x10\n" + "cmp x10, #0x10\n" "blt 101f\n" - "ldr q0, [x10, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q1, [x23, #0x0]\n" - "ldr q2, [x22, #0x0]\n" - "ldr q3, [x21, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" + "ldr q0, [x9, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x26, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, #0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" "blt 99f\n" "97:" // Height 4: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr x9, [x13, #0x78]\n" + "ldr x22, [x12, #0x78]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x28, [x13, #0x88]\n" + "ldr x21, [x12, #0x88]\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr x27, [x13, #0x98]\n" + "ldr x20, [x12, #0x98]\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr d4, [x13, #0x70]\n" + "ldr d4, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "mov v4.d[1], x9\n" + "mov v4.d[1], x22\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr x26, [x13, #0xa8]\n" + "ldr x25, [x12, #0xa8]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr x25, [x13, #0xb8]\n" + "ldr x24, [x12, #0xb8]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr d5, [x13, #0x80]\n" + "ldr d5, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "mov v5.d[1], x28\n" + "mov v5.d[1], x21\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr x24, [x13, #0xc8]\n" + "ldr x23, [x12, #0xc8]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr x20, [x13, #0xd8]\n" + "ldr x22, [x12, #0xd8]\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, 
v3.4b[0]\n" - "ldr d6, [x13, #0x90]\n" + "ldr d6, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x27\n" + "mov v6.d[1], x20\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr x9, [x13, #0xe8]\n" + "ldr x21, [x12, #0xe8]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr x28, [x13, #0xf8]\n" + "ldr x20, [x12, #0xf8]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x13, #0xa0]\n" + "ldr d7, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "mov v7.d[1], x26\n" + "mov v7.d[1], x25\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr d8, [x13, #0xb0]\n" + "ldr d8, [x12, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "mov v8.d[1], x25\n" + "mov v8.d[1], x24\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "add x21, x21, #0x10\n" + "add x26, x26, #0x10\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr d9, [x13, #0xc0]\n" + "ldr d9, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "mov v9.d[1], x24\n" + "mov v9.d[1], x23\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr d10, [x13, #0xd0]\n" + "ldr d10, [x12, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "mov v10.d[1], x20\n" + "mov v10.d[1], x22\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr d4, [x13, #0xe0]\n" + "ldr d4, [x12, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x9\n" + "mov v4.d[1], x21\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr d5, [x13, #0xf0]\n" + "ldr d5, [x12, #0xf0]\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "mov v5.d[1], x28\n" + "mov v5.d[1], x20\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - "add x13, x13, #0x100\n" + "add x12, x12, #0x100\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" @@ -1563,77 +1562,77 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" "98:" // Height 4: Multiply loop: unique 13: skip row sum - "ldr q0, [x10, #0x0]\n" - "sub x11, x11, #0x10\n" - "ldr q1, [x23, #0x0]\n" - "cmp x11, #0x20\n" - "ldr q2, [x22, #0x0]\n" - "ldr q3, [x21, #0x0]\n" - "ldr q4, [x13, #0x0]\n" - "ldr q5, [x13, #0x10]\n" - "ldr q6, [x13, #0x20]\n" - "ldr q7, [x13, #0x30]\n" - "ldr q8, [x13, #0x40]\n" - "ldr q9, [x13, #0x50]\n" - "ldr q10, [x13, #0x60]\n" - "prfm pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "ldr q0, [x9, #0x0]\n" + "sub x10, x10, #0x10\n" + "ldr q1, [x28, #0x0]\n" + "cmp x10, #0x20\n" + "ldr q2, [x27, #0x0]\n" + "ldr q3, [x26, #0x0]\n" + "ldr q4, [x12, #0x0]\n" + "ldr q5, [x12, 
#0x10]\n" + "ldr q6, [x12, #0x20]\n" + "ldr q7, [x12, #0x30]\n" + "ldr q8, [x12, #0x40]\n" + "ldr q9, [x12, #0x50]\n" + "ldr q10, [x12, #0x60]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "bge 97b\n" "99:" // Height 4: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "sub x11, x11, #0x10\n" + "sub x10, x10, #0x10\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "add x10, x10, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "add x23, x23, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" - "ldr q4, [x13, #0x70]\n" + "ldr q4, [x12, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "add x22, x22, #0x10\n" + "add x27, x27, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "add x21, x21, #0x10\n" + "add x26, x26, #0x10\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" - "ldr q5, [x13, #0x80]\n" + "ldr q5, [x12, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x13, #0x90]\n" + "ldr q6, [x12, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x13, #0xa0]\n" + "ldr q7, [x12, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" - "ldr q8, [x13, #0xb0]\n" + "ldr q8, [x12, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" - "ldr q9, [x13, #0xc0]\n" + "ldr q9, [x12, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" - "ldr q10, [x13, #0xd0]\n" + "ldr q10, [x12, #0xd0]\n" ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" - "ldr q4, [x13, #0xe0]\n" + "ldr q4, [x12, #0xe0]\n" ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" - "ldr q5, [x13, #0xf0]\n" + "ldr q5, [x12, #0xf0]\n" ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - "add x13, x13, #0x100\n" + "add x12, x12, #0x100\n" ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" @@ -1667,67 +1666,67 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" "100:" // Height 4: Multiply loop: unique 14: skip row sum - "prfm 
pldl1keep, [x10, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" "101:" // Height 4: Multiply loop: Main loop skip - "cbz x11, 108f\n" - "cmp x11, #0x4\n" + "cbz x10, 108f\n" + "cmp x10, #0x4\n" "blt 104f\n" "102:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x10], #0x4\n" - "ldr s1, [x23], #0x4\n" - "ldr s2, [x22], #0x4\n" - "ldr s3, [x21], #0x4\n" + "ldr s0, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x27], #0x4\n" + "ldr s3, [x26], #0x4\n" "tbnz %x[flags], #31, 103f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q6, [x13, #0x0]\n" - "sub x11, x11, #0x4\n" - "ldr q7, [x13, #0x10]\n" - "cmp x11, #0x4\n" - "ldr q8, [x13, #0x20]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - "ldr q9, [x13, #0x30]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x0]\n" + "sub x10, x10, #0x4\n" + "ldr q6, [x12, #0x10]\n" + "cmp x10, #0x4\n" + "ldr q5, [x12, #0x20]\n" + ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x12, #0x30]\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" "bge 102b\n" "104:" // Height 4: Multiply loop: Skip odd blocks - "cbz x11, 108f\n" - "tbz x11, #1, 105f\n" - "ldr h0, [x10], #0x2\n" - "ldr h1, [x23], #0x2\n" - "ldr h2, [x22], #0x2\n" - "ldr h3, [x21], #0x2\n" - "tbz x11, #0, 106f\n" - "ld1 { v0.b }[2], [x10]\n" - "ld1 { v1.b }[2], [x23]\n" - "ld1 { v2.b }[2], [x22]\n" - "ld1 { v3.b }[2], [x21]\n" + "cbz x10, 108f\n" + "tbz x10, #1, 105f\n" + "ldr h0, [x9], #0x2\n" + "ldr h1, 
[x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h3, [x26], #0x2\n" + "tbz x10, #0, 106f\n" + "ld1 { v0.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x27]\n" + "ld1 { v3.b }[2], [x26]\n" "b 106f\n" "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr b0, [x10, #0x0]\n" - "ldr b1, [x23, #0x0]\n" - "ldr b2, [x22, #0x0]\n" - "ldr b3, [x21, #0x0]\n" + "ldr b0, [x9, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x27, #0x0]\n" + "ldr b3, [x26, #0x0]\n" "106:" // Height 4: Multiply loop: Ragged operand read: Done "tbnz %x[flags], #31, 107f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" @@ -1735,64 +1734,64 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x13, #0x0]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - "ldr q4, [x13, #0x10]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x13, #0x20]\n" - ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" - "ldr q6, [x13, #0x30]\n" - ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - "add x13, x13, #0x40\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" + "ldr q7, [x12, #0x0]\n" + ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x12, #0x10]\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x12, #0x20]\n" + ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" + "ldr q4, [x12, #0x30]\n" + ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + "add x12, x12, #0x40\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" "108:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x12, x12, #0x1\n" - "cmp x12, x20\n" + "add x11, x11, #0x1\n" + "cmp x11, x20\n" "bne 94b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x14, x20\n" + "add x23, x13, x20\n" + "add x22, x23, x20\n" "add x21, x22, x20\n" - "add x20, x21, x20\n" - "prfm pstl1keep, [x14, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" "prfm pstl1keep, [x21, #0x0]\n" - "prfm pstl1keep, [x20, #0x0]\n" "tbnz %x[flags], #31, 109f\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "neg v4.4s, v4.4s\n" + "add x20, %x[qp], %[b_offset]\n" + 
"ld1r { v0.4s }, [x20]\n" + "neg v0.4s, v0.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "109:" // Height 4: skip row sum fixup - "ldr q0, [x16, #0x0]\n" + "ldr q3, [x15, #0x0]\n" "add v16.4s, v16.4s, v11.4s\n" - "ldr q1, [x16, #0x10]\n" + "ldr q2, [x15, #0x10]\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x16, #0x20]\n" + "ldr q1, [x15, #0x20]\n" "add v18.4s, v18.4s, v11.4s\n" - "ldr q3, [x16, #0x30]\n" + "ldr q0, [x15, #0x30]\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" @@ -1806,93 +1805,93 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "add v29.4s, v29.4s, v14.4s\n" "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "add v28.4s, v28.4s, v0.4s\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v2.4s\n" + "add v18.4s, v18.4s, v1.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v2.4s\n" + "add v22.4s, v22.4s, v1.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v2.4s\n" + "add v26.4s, v26.4s, v1.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v2.4s\n" + "add v30.4s, v30.4s, v1.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "add x16, x16, #0x40\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + 
"sqrdmulh v31.4s, v31.4s, v1.4s\n" + "add x15, x15, #0x40\n" "tbz %x[flags], #5, 110f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v16.16b, v0.16b\n" + "and v1.16b, v17.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v2.4s\n" + "sqadd v17.4s, v17.4s, v1.4s\n" + "and v7.16b, v18.16b, v0.16b\n" + "and v6.16b, v19.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v4.16b, v21.16b, v0.16b\n" + "and v3.16b, v22.16b, v0.16b\n" + "and v2.16b, v23.16b, v0.16b\n" + "and v1.16b, v24.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" - "and v10.16b, v29.16b, v0.16b\n" - "and v4.16b, v30.16b, v0.16b\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v7.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "sqadd v22.4s, v22.4s, v3.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "sqadd v24.4s, v24.4s, v1.4s\n" + "and v7.16b, v25.16b, v0.16b\n" + "and v6.16b, v26.16b, v0.16b\n" + "and v5.16b, v27.16b, v0.16b\n" + "and v4.16b, v28.16b, v0.16b\n" + "and v3.16b, v29.16b, v0.16b\n" + "and v2.16b, v30.16b, v0.16b\n" + "and v1.16b, v31.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sqadd v28.4s, v28.4s, v9.4s\n" - "sqadd v29.4s, v29.4s, v10.4s\n" - "sqadd v30.4s, v30.4s, v4.4s\n" - "sqadd v31.4s, v31.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "sqadd v29.4s, v29.4s, v3.4s\n" + "sqadd v30.4s, v30.4s, v2.4s\n" + "sqadd v31.4s, v31.4s, v1.4s\n" "110:" // Height 4: no shift correction "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" @@ -1910,172 +1909,172 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" 
- "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v0.4s }, [x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v0.4s\n" + "add v18.4s, v18.4s, v0.4s\n" + "add v19.4s, v19.4s, v0.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v0.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "add v23.4s, v23.4s, v0.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v29.4s, v29.4s, v0.4s\n" + "add v30.4s, v30.4s, v0.4s\n" + "add v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v0.4s }, [x20]\n" + "smin v16.4s, v16.4s, v0.4s\n" + "smin v17.4s, v17.4s, v0.4s\n" + "smin v18.4s, v18.4s, v0.4s\n" + "smin v19.4s, v19.4s, v0.4s\n" + "smin v20.4s, v20.4s, v0.4s\n" + "smin v21.4s, v21.4s, v0.4s\n" + "smin v22.4s, v22.4s, v0.4s\n" + "smin v23.4s, v23.4s, v0.4s\n" + "smin v24.4s, v24.4s, v0.4s\n" + "smin v25.4s, v25.4s, v0.4s\n" + "smin v26.4s, v26.4s, v0.4s\n" + "smin v27.4s, v27.4s, v0.4s\n" + "smin v28.4s, v28.4s, v0.4s\n" + "smin v29.4s, v29.4s, v0.4s\n" + "smin v30.4s, v30.4s, v0.4s\n" + "smin v31.4s, v31.4s, v0.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v0.4s }, [x20]\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smax v21.4s, v21.4s, v0.4s\n" + "smax v22.4s, v22.4s, v0.4s\n" + "smax v23.4s, v23.4s, v0.4s\n" + "smax v24.4s, v24.4s, v0.4s\n" + "smax v25.4s, v25.4s, v0.4s\n" + "smax v26.4s, v26.4s, v0.4s\n" + "smax v27.4s, v27.4s, v0.4s\n" + "smax v28.4s, v28.4s, v0.4s\n" + "smax v29.4s, v29.4s, v0.4s\n" + "smax v30.4s, v30.4s, v0.4s\n" + "smax v31.4s, v31.4s, v0.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 
v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" - "cmp x15, #0x10\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" + "cmp x14, #0x10\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, v28.16b, v17.16b\n" "bge 119f\n" - "tbz x15, #3, 114f\n" - "str d16, [x14], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" - "tbz x15, #2, 112f\n" - "st1 { v16.s }[2], [x14], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" - "tbz x15, #1, 111f\n" - "st1 { v16.h }[6], [x14], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[14], [x14]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "tbz x14, #3, 114f\n" + "str d16, [x13], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x14, #2, 112f\n" + "st1 { v16.s }[2], [x13], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x14, #1, 111f\n" + "st1 { v16.h }[6], [x13], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[14], [x13]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 118f\n" "111:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[12], [x14]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[12], [x13]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 118f\n" "112:" // Height 4: Partial direct writeback: partial_2_8 - "tbz x15, #1, 113f\n" - "st1 { v16.h }[4], [x14], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[10], [x14]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "tbz x14, #1, 113f\n" + "st1 { v16.h }[4], [x13], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[10], [x13]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 118f\n" "113:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[8], [x14]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[8], [x13]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 118f\n" "114:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x15, #2, 116f\n" - "str s16, [x14], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" - "tbz x15, #1, 115f\n" - "st1 { v16.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], 
[x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[6], [x14]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "tbz x14, #2, 116f\n" + "str s16, [x13], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x14, #1, 115f\n" + "st1 { v16.h }[2], [x13], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[6], [x13]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + "st1 { v28.b }[6], [x21]\n" "b 118f\n" "115:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x15, #0, 118f\n" - "st1 { v16.b }[4], [x14]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[4], [x13]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 118f\n" "116:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x15, #1, 117f\n" - "str h16, [x14], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" - "tbz x15, #0, 118f\n" - "st1 { v16.b }[2], [x14]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "tbz x14, #1, 117f\n" + "str h16, [x13], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x14, #0, 118f\n" + "st1 { v16.b }[2], [x13]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 118f\n" "117:" // Height 4: Partial direct writeback: partial_1_0 - "str b16, [x14, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b16, [x13, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "118:" // Height 4: Partial direct writeback: Done "b 120f\n" "119:" // Height 4: Full writeback - "str q16, [x14, #0x0]\n" - "add x14, x14, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q16, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "120:" // Height 4: Writeback done - "subs x15, x15, #0x10\n" + "subs x14, x14, #0x10\n" "bgt 92b\n" "subs %x[M], %x[M], #0x4\n" "beq 122f\n" @@ -2089,10 +2088,9 @@ void a64_hybrid_u8qa_dot_4x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "122:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", 
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp index 31fbf88603..ebe583b5d4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp @@ -78,7 +78,6 @@ void a64_hybrid_u8qa_dot_4x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 91f\n" @@ -102,11 +101,11 @@ void a64_hybrid_u8qa_dot_4x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -128,32 +127,32 @@ void a64_hybrid_u8qa_dot_4x16 ( "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q21, [x28, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q20, [x28, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q26, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q25, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q24, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q23, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q22, [x28, #0xd0]\n" + ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x28, #0xe0]\n" + ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x28, #0xf0]\n" + ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" "add x24, x24, #0x10\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + 
".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "8:" // Height 1: Multiply loop: unique 1: skip row sum @@ -171,33 +170,33 @@ void a64_hybrid_u8qa_dot_4x16 ( "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q21, [x28, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q20, [x28, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q26, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q25, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q24, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q23, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q22, [x28, #0xd0]\n" + ".inst 0x6fa0e2b3 // udot v19.4s, v21.16b, v0.4b[1]\n" + "ldr q21, [x28, #0xe0]\n" + ".inst 0x6f80ea90 // udot v16.4s, v20.16b, v0.4b[2]\n" + "ldr q20, [x28, #0xf0]\n" + ".inst 0x6f80eb51 // udot v17.4s, v26.16b, v0.4b[2]\n" "sub x25, x25, #0x10\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f80eb32 // udot v18.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f80eb13 // udot v19.4s, v24.16b, v0.4b[2]\n" "add x24, x24, #0x10\n" "add x28, x28, #0x100\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa0eaf0 // udot v16.4s, v23.16b, v0.4b[3]\n" + ".inst 0x6fa0ead1 // udot v17.4s, v22.16b, v0.4b[3]\n" + ".inst 0x6fa0eab2 // udot v18.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa0ea93 // udot v19.4s, v20.16b, v0.4b[3]\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "10:" // Height 1: Multiply loop: unique 2: skip row sum @@ -211,16 +210,16 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q23, [x28, #0x0]\n" + "ldr q22, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x20]\n" + "ldr q20, [x28, #0x30]\n" + ".inst 0x6f80e2f0 // udot v16.4s, v23.16b, v0.4b[0]\n" + ".inst 0x6f80e2d1 // udot v17.4s, v22.16b, v0.4b[0]\n" + ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" "bge 12b\n" "14:" // Height 1: 
Multiply loop: Skip odd blocks @@ -236,14 +235,14 @@ void a64_hybrid_u8qa_dot_4x16 ( "tbnz %x[flags], #31, 17f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "17:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x0]\n" + "ldr q20, [x28, #0x10]\n" + ".inst 0x6f80e2b0 // udot v16.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f80e291 // udot v17.4s, v20.16b, v0.4b[0]\n" + "ldr q21, [x28, #0x20]\n" + "ldr q20, [x28, #0x30]\n" + ".inst 0x6f80e2b2 // udot v18.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f80e293 // udot v19.4s, v20.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" "18:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -252,72 +251,72 @@ void a64_hybrid_u8qa_dot_4x16 ( "bne 4b\n" "prfm pstl1keep, [x27, #0x0]\n" "tbnz %x[flags], #31, 19f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v20.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v1.4s, v1.4s\n" + "neg v20.4s, v20.4s\n" "addp v11.4s, v11.4s, v11.4s\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v20.4s\n" "19:" // Height 1: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q24, [x10, #0x0]\n" + "ldr q23, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q22, [x10, #0x20]\n" + "ldr q21, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v20.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "add v16.4s, v16.4s, v24.4s\n" + "add v17.4s, v17.4s, v23.4s\n" + "add v18.4s, v18.4s, v22.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v19.4s, v19.4s, v21.4s\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" "add x10, x10, #0x40\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" "tbz %x[flags], #5, 20f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v23.16b, v16.16b, v0.16b\n" + "and v22.16b, v17.16b, v0.16b\n" + "and v21.16b, v18.16b, v0.16b\n" + "and v20.16b, v19.16b, v0.16b\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + 
"sqadd v19.4s, v19.4s, v20.4s\n" "20:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v22.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v21.4s }, [x20]\n" + "add v16.4s, v16.4s, v22.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v20.4s }, [x20]\n" + "add v18.4s, v18.4s, v22.4s\n" + "add v19.4s, v19.4s, v22.4s\n" "cmp x9, #0x10\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "smin v16.4s, v16.4s, v21.4s\n" + "smin v17.4s, v17.4s, v21.4s\n" + "smin v18.4s, v18.4s, v21.4s\n" + "smin v19.4s, v19.4s, v21.4s\n" + "smax v16.4s, v16.4s, v20.4s\n" + "smax v17.4s, v17.4s, v20.4s\n" + "smax v18.4s, v18.4s, v20.4s\n" + "smax v19.4s, v19.4s, v20.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" "uzp1 v17.8h, v18.8h, v19.8h\n" "uzp1 v16.16b, v16.16b, v17.16b\n" @@ -397,12 +396,12 @@ void a64_hybrid_u8qa_dot_4x16 ( "34:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 35f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -410,7 +409,7 @@ void a64_hybrid_u8qa_dot_4x16 ( "b 36f\n" "35:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "36:" // Height 2: input setup done "cmp x25, #0x10\n" "blt 41f\n" @@ -428,48 +427,48 @@ void a64_hybrid_u8qa_dot_4x16 ( "37:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" "add x24, x24, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "add x23, x23, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6fa0e152 // 
udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x28, #0xf0]\n" "add x28, x28, #0x100\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 38f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" @@ -491,49 +490,49 @@ void a64_hybrid_u8qa_dot_4x16 ( "39:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" "sub x25, x25, #0x10\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "add x24, x24, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" "add x23, x23, #0x10\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, 
v1.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6fa0e333 // udot v19.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e337 // udot v23.4s, v25.16b, v1.4b[1]\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6f80eb10 // udot v16.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb14 // udot v20.4s, v24.16b, v1.4b[2]\n" + "ldr q24, [x28, #0xf0]\n" "add x28, x28, #0x100\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n" + ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n" + ".inst 0x6f80ebb2 // udot v18.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebb6 // udot v22.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f80eb93 // udot v19.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb97 // udot v23.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6fa0eb70 // udot v16.4s, v27.16b, v0.4b[3]\n" + ".inst 0x6fa1eb74 // udot v20.4s, v27.16b, v1.4b[3]\n" + ".inst 0x6fa0eb51 // udot v17.4s, v26.16b, v0.4b[3]\n" + ".inst 0x6fa1eb55 // udot v21.4s, v26.16b, v1.4b[3]\n" + ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb36 // udot v22.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa0eb13 // udot v19.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb17 // udot v23.4s, v24.16b, v1.4b[3]\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" @@ -551,21 +550,21 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "43:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q27, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x6f80e370 // udot v16.4s, v27.16b, v0.4b[0]\n" + ".inst 0x6f81e374 // udot v20.4s, v27.16b, v1.4b[0]\n" + ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" + ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - 
".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" "bge 42b\n" "44:" // Height 2: Multiply loop: Skip odd blocks "cbz x25, 48f\n" @@ -584,209 +583,209 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" "47:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x6f80e310 // udot v16.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e314 // udot v20.4s, v24.16b, v1.4b[0]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x6f80e351 // udot v17.4s, v26.16b, v0.4b[0]\n" + ".inst 0x6f81e355 // udot v21.4s, v26.16b, v1.4b[0]\n" + ".inst 0x6f80e332 // udot v18.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e336 // udot v22.4s, v25.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e313 // udot v19.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e317 // udot v23.4s, v24.16b, v1.4b[0]\n" "48:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 34b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" + "add x23, x27, x20\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "tbnz %x[flags], #31, 49f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "neg v2.4s, v2.4s\n" + "neg v24.4s, v24.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + "mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "49:" // Height 2: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q27, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q26, [x10, #0x20]\n" + "ldr q25, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, 
v17.4s, v27.4s\n" "add x10, x10, #0x40\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "add v20.4s, v20.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v25.4s\n" + "add v20.4s, v20.4s, v28.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v27.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v23.4s, v23.4s, v25.4s\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" "tbz %x[flags], #5, 50f\n" - "and v4.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" + "and v24.16b, v16.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v24.4s\n" + "and v30.16b, v17.16b, v0.16b\n" + "and v29.16b, v18.16b, v0.16b\n" + "and v28.16b, v19.16b, v0.16b\n" + "and v27.16b, v20.16b, v0.16b\n" + "and v26.16b, v21.16b, v0.16b\n" + "and v25.16b, v22.16b, v0.16b\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v29.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v20.4s, v20.4s, v27.4s\n" + "sqadd v21.4s, v21.4s, v26.4s\n" + "sqadd v22.4s, v22.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v24.4s\n" "50:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - 
"add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v17.8h, v22.8h, v23.8h\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v20.16b, v20.16b, v17.16b\n" "bge 59f\n" "tbz x9, #3, 54f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" + "str d20, [x23], #0x8\n" "tbz x9, #2, 52f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" "tbz x9, #1, 51f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" + "st1 { v20.b }[14], [x23]\n" "b 58f\n" "51:" // Height 2: Partial direct writeback: partial_1_12 "tbz x9, #0, 58f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" + "st1 { v20.b }[12], [x23]\n" "b 58f\n" "52:" // Height 2: Partial direct writeback: partial_2_8 "tbz x9, #1, 53f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" + "st1 { v20.b }[10], [x23]\n" "b 58f\n" "53:" // Height 2: Partial direct writeback: partial_1_8 "tbz x9, #0, 58f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" + "st1 { v20.b }[8], [x23]\n" "b 58f\n" "54:" // Height 2: Partial direct writeback: partial_4_0 "tbz x9, #2, 56f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" + "str s20, [x23], #0x4\n" "tbz x9, #1, 55f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" + "st1 { v20.b }[6], [x23]\n" "b 58f\n" "55:" // Height 2: Partial direct writeback: partial_1_4 "tbz x9, #0, 58f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" + "st1 { v20.b }[4], [x23]\n" "b 58f\n" "56:" // Height 2: Partial 
direct writeback: partial_2_0 "tbz x9, #1, 57f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" + "str h20, [x23], #0x2\n" "tbz x9, #0, 58f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" + "st1 { v20.b }[2], [x23]\n" "b 58f\n" "57:" // Height 2: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x22, #0x0]\n" + "str b20, [x23, #0x0]\n" "58:" // Height 2: Partial direct writeback: Done "b 60f\n" "59:" // Height 2: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" + "str q20, [x23, #0x0]\n" "60:" // Height 2: Writeback done "subs x9, x9, #0x10\n" "bgt 32b\n" @@ -819,13 +818,13 @@ void a64_hybrid_u8qa_dot_4x16 ( "64:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 65f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 66f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -834,8 +833,8 @@ void a64_hybrid_u8qa_dot_4x16 ( "b 66f\n" "65:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "66:" // Height 3: input setup done "cmp x25, #0x10\n" "blt 71f\n" @@ -857,62 +856,62 @@ void a64_hybrid_u8qa_dot_4x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q29, [x28, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q28, [x28, #0x80]\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q5, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q4, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q3, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q31, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" - "ldr 
q5, [x28, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q30, [x28, #0xd0]\n" + ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x28, #0xe0]\n" + ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" + "ldr q28, [x28, #0xf0]\n" + ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 68f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" @@ -940,63 +939,63 @@ void a64_hybrid_u8qa_dot_4x16 ( "sub x25, x25, #0x10\n" "add x24, x24, #0x10\n" ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" - "ldr q4, [x28, #0x70]\n" + "ldr q29, [x28, #0x70]\n" ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" "add x23, x23, #0x10\n" ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" - "ldr q5, [x28, #0x80]\n" + "ldr q28, [x28, #0x80]\n" "add x22, x22, #0x10\n" ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, 
v1.4b[0]\n" ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x28, #0x90]\n" + "ldr q5, [x28, #0x90]\n" ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x28, #0xa0]\n" + "ldr q4, [x28, #0xa0]\n" ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" - "ldr q8, [x28, #0xb0]\n" + "ldr q3, [x28, #0xb0]\n" ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" - "ldr q9, [x28, #0xc0]\n" + "ldr q31, [x28, #0xc0]\n" ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" - "ldr q10, [x28, #0xd0]\n" - ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" - "ldr q4, [x28, #0xe0]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" - "ldr q5, [x28, #0xf0]\n" - ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q30, [x28, #0xd0]\n" + ".inst 0x6fa0e3b3 // udot v19.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3b7 // udot v23.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3bb // udot v27.4s, v29.16b, v2.4b[1]\n" + "ldr q29, [x28, #0xe0]\n" + ".inst 0x6f80eb90 // udot v16.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb94 // udot v20.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb98 // udot v24.4s, v28.16b, v2.4b[2]\n" + "ldr q28, [x28, #0xf0]\n" + ".inst 0x6f80e8b1 // udot v17.4s, v5.16b, v0.4b[2]\n" "add x28, x28, #0x100\n" - ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" - ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" - ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" - ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" - ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" - ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" - ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6f81e8b5 // udot v21.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b9 // udot v25.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f80e892 // udot v18.4s, v4.16b, v0.4b[2]\n" + ".inst 0x6f81e896 // udot v22.4s, v4.16b, v1.4b[2]\n" + ".inst 0x6f82e89a // udot v26.4s, v4.16b, v2.4b[2]\n" + ".inst 0x6f80e873 // udot v19.4s, v3.16b, v0.4b[2]\n" + ".inst 0x6f81e877 // udot v23.4s, v3.16b, v1.4b[2]\n" + ".inst 0x6f82e87b // udot v27.4s, v3.16b, v2.4b[2]\n" + ".inst 0x6fa0ebf0 // udot v16.4s, v31.16b, v0.4b[3]\n" + ".inst 
0x6fa1ebf4 // udot v20.4s, v31.16b, v1.4b[3]\n" + ".inst 0x6fa2ebf8 // udot v24.4s, v31.16b, v2.4b[3]\n" + ".inst 0x6fa0ebd1 // udot v17.4s, v30.16b, v0.4b[3]\n" + ".inst 0x6fa1ebd5 // udot v21.4s, v30.16b, v1.4b[3]\n" + ".inst 0x6fa2ebd9 // udot v25.4s, v30.16b, v2.4b[3]\n" + ".inst 0x6fa0ebb2 // udot v18.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebb6 // udot v22.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebba // udot v26.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa0eb93 // udot v19.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb97 // udot v23.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb9b // udot v27.4s, v28.16b, v2.4b[3]\n" "tbnz %x[flags], #31, 70f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" @@ -1018,25 +1017,25 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "73:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q31, [x28, #0x0]\n" + "ldr q30, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q29, [x28, #0x20]\n" + "ldr q28, [x28, #0x30]\n" + ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" + ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" + ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" + ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" "bge 72b\n" "74:" // Height 3: Multiply loop: Skip odd blocks "cbz x25, 78f\n" @@ -1059,144 +1058,144 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" + "ldr q31, [x28, #0x0]\n" + "ldr q30, [x28, #0x10]\n" + ".inst 0x6f80e3f0 // udot v16.4s, v31.16b, v0.4b[0]\n" + ".inst 0x6f81e3f4 // udot v20.4s, v31.16b, v1.4b[0]\n" + "ldr q29, [x28, 
#0x20]\n" + "ldr q28, [x28, #0x30]\n" + ".inst 0x6f82e3f8 // udot v24.4s, v31.16b, v2.4b[0]\n" + ".inst 0x6f80e3d1 // udot v17.4s, v30.16b, v0.4b[0]\n" + ".inst 0x6f81e3d5 // udot v21.4s, v30.16b, v1.4b[0]\n" + ".inst 0x6f82e3d9 // udot v25.4s, v30.16b, v2.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3b6 // udot v22.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3ba // udot v26.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e397 // udot v23.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e39b // udot v27.4s, v28.16b, v2.4b[0]\n" "78:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 64b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "tbnz %x[flags], #31, 79f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v28.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" + "neg v28.4s, v28.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v28.4s\n" + "mul v12.4s, v12.4s, v28.4s\n" + "mul v13.4s, v13.4s, v28.4s\n" "79:" // Height 3: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q31, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q30, [x10, #0x20]\n" + "ldr q29, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v28.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v31.4s\n" + "add v18.4s, v18.4s, v30.4s\n" + "add v19.4s, v19.4s, v29.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add v21.4s, v21.4s, v31.4s\n" + "add v22.4s, v22.4s, v30.4s\n" + "add v23.4s, v23.4s, v29.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh 
v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v31.4s\n" + "add v26.4s, v26.4s, v30.4s\n" + "add v27.4s, v27.4s, v29.4s\n" + "sqrdmulh v16.4s, v16.4s, v28.4s\n" + "sqrdmulh v17.4s, v17.4s, v28.4s\n" + "sqrdmulh v18.4s, v18.4s, v28.4s\n" + "sqrdmulh v19.4s, v19.4s, v28.4s\n" + "sqrdmulh v20.4s, v20.4s, v28.4s\n" + "sqrdmulh v21.4s, v21.4s, v28.4s\n" + "sqrdmulh v22.4s, v22.4s, v28.4s\n" + "sqrdmulh v23.4s, v23.4s, v28.4s\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" "tbz %x[flags], #5, 80f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v16.16b, v0.16b\n" + "and v31.16b, v17.16b, v0.16b\n" + "and v30.16b, v18.16b, v0.16b\n" + "and v29.16b, v19.16b, v0.16b\n" + "and v28.16b, v20.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v30.4s\n" + "sqadd v19.4s, v19.4s, v29.4s\n" + "sqadd v20.4s, v20.4s, v28.4s\n" + "and v3.16b, v21.16b, v0.16b\n" + "and v2.16b, v22.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" + "and v31.16b, v24.16b, v0.16b\n" + "and v30.16b, v25.16b, v0.16b\n" + "and v29.16b, v26.16b, v0.16b\n" + "and v28.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v3.4s\n" + "sqadd v22.4s, v22.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "sqadd v27.4s, v27.4s, v28.4s\n" "80:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" 
+ "ld1r { v30.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1204,132 +1203,132 @@ void a64_hybrid_u8qa_dot_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v16.4s, v16.4s, v30.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v30.4s\n" + "add v19.4s, v19.4s, v30.4s\n" + "add v20.4s, v20.4s, v30.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "add v22.4s, v22.4s, v30.4s\n" + "add v23.4s, v23.4s, v30.4s\n" + "add v24.4s, v24.4s, v30.4s\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v30.4s\n" + "add v27.4s, v27.4s, v30.4s\n" + "smin v16.4s, v16.4s, v29.4s\n" + "smin v17.4s, v17.4s, v29.4s\n" + "smin v18.4s, v18.4s, v29.4s\n" + "smin v19.4s, v19.4s, v29.4s\n" + "smin v20.4s, v20.4s, v29.4s\n" + "smin v21.4s, v21.4s, v29.4s\n" + "smin v22.4s, v22.4s, v29.4s\n" + "smin v23.4s, v23.4s, v29.4s\n" + "smin v24.4s, v24.4s, v29.4s\n" + "smin v25.4s, v25.4s, v29.4s\n" + "smin v26.4s, v26.4s, v29.4s\n" + "smin v27.4s, v27.4s, v29.4s\n" + "smax v16.4s, v16.4s, v28.4s\n" + "smax v17.4s, v17.4s, v28.4s\n" + "smax v18.4s, v18.4s, v28.4s\n" + "smax v19.4s, v19.4s, v28.4s\n" + "smax v20.4s, v20.4s, v28.4s\n" + "smax v21.4s, v21.4s, v28.4s\n" + "smax v22.4s, v22.4s, v28.4s\n" + "smax v23.4s, v23.4s, v28.4s\n" + "smax v24.4s, v24.4s, v28.4s\n" + "smax v25.4s, v25.4s, v28.4s\n" + "smax v26.4s, v26.4s, v28.4s\n" + "smax v27.4s, v27.4s, v28.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v18.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 
v20.16b, v20.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 89f\n" "tbz x9, #3, 84f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x9, #2, 82f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x9, #1, 81f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 88f\n" "81:" // Height 3: Partial direct writeback: partial_1_12 "tbz x9, #0, 88f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 88f\n" "82:" // Height 3: Partial direct writeback: partial_2_8 "tbz x9, #1, 83f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 88f\n" "83:" // Height 3: Partial direct writeback: partial_1_8 "tbz x9, #0, 88f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 88f\n" "84:" // Height 3: Partial direct writeback: partial_4_0 "tbz x9, #2, 86f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x9, #1, 85f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 88f\n" "85:" // Height 3: Partial direct writeback: partial_1_4 "tbz x9, #0, 88f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 88f\n" "86:" // Height 3: Partial direct writeback: partial_2_0 "tbz x9, #1, 87f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" "tbz x9, #0, 88f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 88f\n" "87:" // Height 3: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - "str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "88:" // Height 3: Partial direct writeback: Done "b 90f\n" "89:" // Height 3: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "90:" // Height 3: Writeback done "subs x9, x9, #0x10\n" "bgt 62b\n" @@ -1370,14 +1369,14 @@ void a64_hybrid_u8qa_dot_4x16 ( "94:" // Height 4: String loop "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 95f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 96f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1387,9 +1386,9 @@ void a64_hybrid_u8qa_dot_4x16 ( "b 96f\n" "95:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "96:" // Height 4: input setup done "cmp x25, #0x10\n" "blt 101f\n" @@ -1614,29 +1613,29 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" "103:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q6, [x28, #0x0]\n" - "ldr q7, [x28, #0x10]\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ldr q8, [x28, #0x20]\n" - "ldr q9, [x28, #0x30]\n" - ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q5, [x28, #0x20]\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" + ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" "bge 102b\n" "104:" // Height 4: Multiply loop: Skip odd blocks "cbz x25, 108f\n" @@ -1663,73 +1662,73 @@ void a64_hybrid_u8qa_dot_4x16 ( ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" ".inst 0x6e8f946e // udot v14.4s, v3.16b, 
v15.16b\n" "107:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x28, #0x10]\n" + ".inst 0x6f80e0f0 // udot v16.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" + "ldr q4, [x28, #0x30]\n" + ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fc // udot v28.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" "add x28, x28, #0x40\n" - ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n" ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n" ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" "108:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 94b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "prfm pstl1keep, [x27, #0x0]\n" - "add x20, x21, x20\n" + "add x21, x22, x20\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" "prfm pstl1keep, [x21, #0x0]\n" - "prfm pstl1keep, [x20, #0x0]\n" "tbnz %x[flags], #31, 109f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v0.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "neg v4.4s, v4.4s\n" + "neg v0.4s, v0.4s\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v12.4s, v12.4s, v12.4s\n" "addp v13.4s, v13.4s, v13.4s\n" "addp v14.4s, v14.4s, v14.4s\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "109:" // Height 4: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" "add v16.4s, v16.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x10, #0x20]\n" + "ldr q2, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add v19.4s, v19.4s, v11.4s\n" "add v20.4s, v20.4s, v12.4s\n" "add v21.4s, v21.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, 
[x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "add v22.4s, v22.4s, v12.4s\n" "add v23.4s, v23.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" @@ -1740,100 +1739,100 @@ void a64_hybrid_u8qa_dot_4x16 ( "add v30.4s, v30.4s, v14.4s\n" "add v31.4s, v31.4s, v14.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v2.4s\n" "add v20.4s, v20.4s, v0.4s\n" - "add v21.4s, v21.4s, v1.4s\n" - "add v22.4s, v22.4s, v2.4s\n" - "add v23.4s, v23.4s, v3.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v23.4s, v23.4s, v2.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v2.4s\n" "add v28.4s, v28.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v29.4s, v29.4s, v1.4s\n" - "add v30.4s, v30.4s, v2.4s\n" - "add v31.4s, v31.4s, v3.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v4.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v31.4s, v31.4s, v2.4s\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + "sqrdmulh v31.4s, v31.4s, v1.4s\n" "tbz %x[flags], #5, 110f\n" - "and v4.16b, v16.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v16.4s, v16.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "and v8.16b, v20.16b, v0.16b\n" - "and v9.16b, v21.16b, v0.16b\n" - "and v10.16b, v22.16b, v0.16b\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v16.16b, v0.16b\n" + "and v1.16b, v17.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v2.4s\n" + "sqadd v17.4s, v17.4s, v1.4s\n" + "and v7.16b, v18.16b, v0.16b\n" + "and v6.16b, v19.16b, v0.16b\n" + "and v5.16b, v20.16b, v0.16b\n" + "and v4.16b, v21.16b, v0.16b\n" + "and v3.16b, v22.16b, v0.16b\n" + "and v2.16b, v23.16b, v0.16b\n" + "and v1.16b, v24.16b, v0.16b\n" "sshr 
v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" - "sqadd v20.4s, v20.4s, v8.4s\n" - "sqadd v21.4s, v21.4s, v9.4s\n" - "sqadd v22.4s, v22.4s, v10.4s\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "and v9.16b, v28.16b, v0.16b\n" - "and v10.16b, v29.16b, v0.16b\n" - "and v4.16b, v30.16b, v0.16b\n" - "and v5.16b, v31.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v7.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "sqadd v20.4s, v20.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "sqadd v22.4s, v22.4s, v3.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "sqadd v24.4s, v24.4s, v1.4s\n" + "and v7.16b, v25.16b, v0.16b\n" + "and v6.16b, v26.16b, v0.16b\n" + "and v5.16b, v27.16b, v0.16b\n" + "and v4.16b, v28.16b, v0.16b\n" + "and v3.16b, v29.16b, v0.16b\n" + "and v2.16b, v30.16b, v0.16b\n" + "and v1.16b, v31.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" - "sqadd v28.4s, v28.4s, v9.4s\n" - "sqadd v29.4s, v29.4s, v10.4s\n" - "sqadd v30.4s, v30.4s, v4.4s\n" - "sqadd v31.4s, v31.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v5.4s\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "sqadd v29.4s, v29.4s, v3.4s\n" + "sqadd v30.4s, v30.4s, v2.4s\n" + "sqadd v31.4s, v31.4s, v1.4s\n" "110:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v3.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v2.4s }, [x20]\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v1.4s }, [x20]\n" "srshl v22.4s, v22.4s, v0.4s\n" "srshl v23.4s, v23.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1845,163 +1844,163 @@ void a64_hybrid_u8qa_dot_4x16 ( "srshl v29.4s, v29.4s, v0.4s\n" "srshl v30.4s, v30.4s, v0.4s\n" "srshl v31.4s, v31.4s, v0.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - 
"smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + "smin v29.4s, v29.4s, v2.4s\n" + "smin v30.4s, v30.4s, v2.4s\n" + "smin v31.4s, v31.4s, v2.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + "smax v29.4s, v29.4s, v1.4s\n" + "smax v30.4s, v30.4s, v1.4s\n" + "smax v31.4s, v31.4s, v1.4s\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v0.8h, v18.8h, v19.8h\n" "uzp1 v20.8h, v20.8h, v21.8h\n" - "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v19.8h, v22.8h, v23.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v18.8h, v26.8h, v27.8h\n" "uzp1 v28.8h, v28.8h, v29.8h\n" - "uzp1 v29.8h, v30.8h, v31.8h\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v20.16b, v20.16b, v21.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" - "uzp1 v28.16b, v28.16b, v29.16b\n" + "uzp1 v17.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v0.16b\n" + "uzp1 v20.16b, v20.16b, v19.16b\n" + "uzp1 v24.16b, v24.16b, v18.16b\n" + "uzp1 v28.16b, v28.16b, v17.16b\n" "bge 119f\n" "tbz x9, #3, 114f\n" "str d16, [x27], #0x8\n" - "str d20, [x22], #0x8\n" - "str d24, [x21], #0x8\n" - "str d28, [x20], #0x8\n" + "str d20, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "str d28, [x21], #0x8\n" "tbz x9, #2, 112f\n" "st1 { v16.s }[2], [x27], #0x4\n" - "st1 { v20.s }[2], [x22], 
#0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" - "st1 { v28.s }[2], [x20], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" "tbz x9, #1, 111f\n" "st1 { v16.h }[6], [x27], #0x2\n" - "st1 { v20.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" - "st1 { v28.h }[6], [x20], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[14], [x27]\n" - "st1 { v20.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" - "st1 { v28.b }[14], [x20]\n" + "st1 { v20.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" + "st1 { v28.b }[14], [x21]\n" "b 118f\n" "111:" // Height 4: Partial direct writeback: partial_1_12 "tbz x9, #0, 118f\n" "st1 { v16.b }[12], [x27]\n" - "st1 { v20.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" - "st1 { v28.b }[12], [x20]\n" + "st1 { v20.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" + "st1 { v28.b }[12], [x21]\n" "b 118f\n" "112:" // Height 4: Partial direct writeback: partial_2_8 "tbz x9, #1, 113f\n" "st1 { v16.h }[4], [x27], #0x2\n" - "st1 { v20.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" - "st1 { v28.h }[4], [x20], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[10], [x27]\n" - "st1 { v20.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" - "st1 { v28.b }[10], [x20]\n" + "st1 { v20.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" + "st1 { v28.b }[10], [x21]\n" "b 118f\n" "113:" // Height 4: Partial direct writeback: partial_1_8 "tbz x9, #0, 118f\n" "st1 { v16.b }[8], [x27]\n" - "st1 { v20.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" - "st1 { v28.b }[8], [x20]\n" + "st1 { v20.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" + "st1 { v28.b }[8], [x21]\n" "b 118f\n" "114:" // Height 4: Partial direct writeback: partial_4_0 "tbz x9, #2, 116f\n" "str s16, [x27], #0x4\n" - "str s20, [x22], #0x4\n" - "str s24, [x21], #0x4\n" - "str s28, [x20], #0x4\n" + "str s20, [x23], #0x4\n" + "str s24, [x22], #0x4\n" + "str s28, [x21], #0x4\n" "tbz x9, #1, 115f\n" "st1 { v16.h }[2], [x27], #0x2\n" - "st1 { v20.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" - "st1 { v28.h }[2], [x20], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[6], [x27]\n" - "st1 { v20.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" - "st1 { v28.b }[6], [x20]\n" + "st1 { v20.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" + "st1 { v28.b }[6], [x21]\n" "b 118f\n" "115:" // Height 4: Partial direct writeback: partial_1_4 "tbz x9, #0, 118f\n" "st1 { v16.b }[4], [x27]\n" - "st1 { v20.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" - "st1 { v28.b }[4], [x20]\n" + "st1 { v20.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" + "st1 { v28.b }[4], [x21]\n" "b 118f\n" "116:" // Height 4: Partial direct writeback: partial_2_0 "tbz x9, #1, 117f\n" "str h16, [x27], #0x2\n" - "str h20, [x22], #0x2\n" - "str h24, [x21], #0x2\n" - "str h28, [x20], #0x2\n" + "str h20, [x23], #0x2\n" + "str h24, [x22], #0x2\n" + "str h28, [x21], #0x2\n" "tbz x9, #0, 118f\n" "st1 { v16.b }[2], [x27]\n" - "st1 { v20.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" - "st1 { v28.b }[2], [x20]\n" + "st1 { v20.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" + "st1 { v28.b }[2], [x21]\n" "b 118f\n" "117:" // Height 4: Partial direct writeback: partial_1_0 "str b16, [x27, #0x0]\n" - 
"str b20, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" - "str b28, [x20, #0x0]\n" + "str b20, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" + "str b28, [x21, #0x0]\n" "118:" // Height 4: Partial direct writeback: Done "b 120f\n" "119:" // Height 4: Full writeback "str q16, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q20, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" - "str q28, [x20, #0x0]\n" + "str q20, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" + "str q28, [x21, #0x0]\n" "120:" // Height 4: Writeback done "subs x9, x9, #0x10\n" "bgt 92b\n" @@ -2017,7 +2016,6 @@ void a64_hybrid_u8qa_dot_4x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "122:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp index 8a47701a4a..17e7405a0a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -81,7 +81,7 @@ public: case CPUModel::A510: return { 28.00 }; case CPUModel::V1: - return { 68.98 }; + return { 62.26 }; } } @@ -98,5 +98,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp index f808cb199d..1335b355ef 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp @@ -78,7 +78,6 @@ void a64_hybrid_u8qa_mmla_4x16 ( flags |= 0x20; } __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x4\n" "bge 97f\n" @@ -106,11 +105,11 @@ void a64_hybrid_u8qa_mmla_4x16 ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -131,35 +130,35 @@ void a64_hybrid_u8qa_mmla_4x16 ( "ldr q4, [x28, #0x60]\n" "blt 9f\n" "7:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v27.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q25, [x28, #0x70]\n" + "trn2 v1.2d, v1.2d, v27.2d\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" + ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" "add x28, x28, #0x100\n" - ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" - ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" + ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" + ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" + ".inst 
0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" + ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 8f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -177,36 +176,36 @@ void a64_hybrid_u8qa_mmla_4x16 ( "prfm pldl1keep, [x24, #0x80]\n" "bge 7b\n" "9:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v24.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" + "ldr q25, [x28, #0x70]\n" + "trn2 v1.2d, v1.2d, v24.2d\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" "sub x25, x25, #0x10\n" - ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" - ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" + ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" "add x24, x24, #0x10\n" "add x28, x28, #0x100\n" - ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" - ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" + ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" + ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" + ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" + ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 10f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -217,29 +216,29 @@ void a64_hybrid_u8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 14f\n" "12:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x24], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d25, [x24], #0x8\n" + "trn1 v0.2d, v25.2d, v24.2d\n" "tbnz %x[flags], #31, 13f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "13:" // Height 1: Multiply loop: unique 3: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" "sub x25, x25, #0x8\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" "cmp x25, #0x8\n" - ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" - "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" - ".inst 0x6e84a415 // ummla 
v21.4s, v0.16b, v4.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" - ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" - ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n" + "ldr q27, [x28, #0x40]\n" + "ldr q26, [x28, #0x50]\n" + ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" + ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" + ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "bge 12b\n" "14:" // Height 1: Multiply loop: Skip odd blocks @@ -264,26 +263,26 @@ void a64_hybrid_u8qa_mmla_4x16 ( "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x24, #0x0]\n" "18:" // Height 1: Multiply loop: Ragged operand read: Done - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v0.2d, v1.2d, v24.2d\n" "tbnz %x[flags], #31, 19f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "19:" // Height 1: Multiply loop: unique 4: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" - ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" - ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" - ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" - ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" - ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + "ldr q25, [x28, #0x0]\n" + "ldr q24, [x28, #0x10]\n" + ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x40]\n" + "ldr q24, [x28, #0x50]\n" + ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "20:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -297,75 +296,75 @@ void a64_hybrid_u8qa_mmla_4x16 ( "uzp1 v19.2d, v19.2d, v23.2d\n" "mov v23.16b, v16.16b\n" "tbnz %x[flags], #31, 21f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v1.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v16.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v1.4s, v1.4s\n" + "neg v16.4s, v16.4s\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v1.4s\n" + "mul v11.4s, v11.4s, v16.4s\n" "21:" // Height 1: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q24, [x10, #0x0]\n" + "ldr q22, [x10, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v17.4s, v17.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q21, [x10, #0x20]\n" + "ldr q20, [x10, #0x30]\n" "add v18.4s, v18.4s, v11.4s\n" "add 
v19.4s, v19.4s, v11.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v16.4s }, [x20]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1r { v0.4s }, [x23]\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "add v23.4s, v23.4s, v24.4s\n" + "add v17.4s, v17.4s, v22.4s\n" + "add v18.4s, v18.4s, v21.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v19.4s, v19.4s, v20.4s\n" + "sqrdmulh v23.4s, v23.4s, v16.4s\n" "add x10, x10, #0x40\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v16.4s\n" + "sqrdmulh v18.4s, v18.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v16.4s\n" "tbz %x[flags], #5, 22f\n" - "and v4.16b, v23.16b, v0.16b\n" - "and v5.16b, v17.16b, v0.16b\n" - "and v6.16b, v18.16b, v0.16b\n" - "and v7.16b, v19.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "sqadd v17.4s, v17.4s, v5.4s\n" - "sqadd v18.4s, v18.4s, v6.4s\n" - "sqadd v19.4s, v19.4s, v7.4s\n" + "and v22.16b, v23.16b, v0.16b\n" + "and v21.16b, v17.16b, v0.16b\n" + "and v20.16b, v18.16b, v0.16b\n" + "and v16.16b, v19.16b, v0.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v22.4s\n" + "sqadd v17.4s, v17.4s, v21.4s\n" + "sqadd v18.4s, v18.4s, v20.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" "22:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v21.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v20.4s }, [x20]\n" + "add v23.4s, v23.4s, v21.4s\n" + "add v17.4s, v17.4s, v21.4s\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v16.4s }, [x20]\n" + "add v18.4s, v18.4s, v21.4s\n" + "add v19.4s, v19.4s, v21.4s\n" "cmp x9, #0x10\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "smin v23.4s, v23.4s, v20.4s\n" + "smin v17.4s, v17.4s, v20.4s\n" + "smin v18.4s, v18.4s, v20.4s\n" + "smin v19.4s, v19.4s, v20.4s\n" + "smax v23.4s, v23.4s, v16.4s\n" + "smax v17.4s, v17.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smax v19.4s, v19.4s, v16.4s\n" "uzp1 v23.8h, v23.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" - "uzp1 v23.16b, v23.16b, v17.16b\n" + "uzp1 v16.8h, v18.8h, v19.8h\n" + "uzp1 v23.16b, v23.16b, v16.16b\n" "bge 31f\n" "tbz x9, #3, 26f\n" "str d23, [x27], #0x8\n" @@ -442,12 +441,12 @@ void a64_hybrid_u8qa_mmla_4x16 ( "36:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, 
[%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 37f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 38f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -455,7 +454,7 @@ void a64_hybrid_u8qa_mmla_4x16 ( "b 38f\n" "37:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "38:" // Height 2: input setup done "cmp x25, #0x10\n" "blt 43f\n" @@ -473,34 +472,34 @@ void a64_hybrid_u8qa_mmla_4x16 ( "39:" // Height 2: Multiply loop: Main loop head "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr q29, [x28, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" - ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" + ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" "add x23, x23, #0x10\n" - ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" + ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" "add x28, x28, #0x100\n" - ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" + ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" + ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" + ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 40f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -522,35 +521,35 @@ void a64_hybrid_u8qa_mmla_4x16 ( "41:" // Height 2: Multiply loop: Single iteration only "trn1 v0.2d, v1.2d, v2.2d\n" ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q25, [x28, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q24, [x28, #0x80]\n" "trn2 v1.2d, v1.2d, v2.2d\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" - "ldr q7, [x28, #0x90]\n" + "ldr q30, [x28, #0x90]\n" ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n" - "ldr q8, [x28, #0xa0]\n" + "ldr 
q29, [x28, #0xa0]\n" ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n" - "ldr q9, [x28, #0xb0]\n" + "ldr q28, [x28, #0xb0]\n" ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" - "ldr q10, [x28, #0xc0]\n" + "ldr q27, [x28, #0xc0]\n" ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + "ldr q26, [x28, #0xd0]\n" + ".inst 0x6e99a417 // ummla v23.4s, v0.16b, v25.16b\n" + "ldr q25, [x28, #0xe0]\n" + ".inst 0x6e98a430 // ummla v16.4s, v1.16b, v24.16b\n" + "ldr q24, [x28, #0xf0]\n" "sub x25, x25, #0x10\n" - ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" - ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n" + ".inst 0x6e9ea434 // ummla v20.4s, v1.16b, v30.16b\n" + ".inst 0x6e9da431 // ummla v17.4s, v1.16b, v29.16b\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n" - ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" + ".inst 0x6e9ca435 // ummla v21.4s, v1.16b, v28.16b\n" + ".inst 0x6e9ba432 // ummla v18.4s, v1.16b, v27.16b\n" "add x28, x28, #0x100\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa436 // ummla v22.4s, v1.16b, v26.16b\n" + ".inst 0x6e99a433 // ummla v19.4s, v1.16b, v25.16b\n" + ".inst 0x6e98a437 // ummla v23.4s, v1.16b, v24.16b\n" "tbnz %x[flags], #31, 42f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n" @@ -562,30 +561,30 @@ void a64_hybrid_u8qa_mmla_4x16 ( "cmp x25, #0x8\n" "blt 46f\n" "44:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "trn1 v0.2d, v25.2d, v24.2d\n" "tbnz %x[flags], #31, 45f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "45:" // Height 2: Multiply loop: unique 7: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" + "ldr q24, [x28, #0x0]\n" + "ldr q26, [x28, #0x10]\n" + ".inst 0x6e98a410 // ummla v16.4s, v0.16b, v24.16b\n" "sub x25, x25, #0x8\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" "cmp x25, #0x8\n" - ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" - "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" - ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" - ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" - ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" + ".inst 0x6e9aa414 // ummla v20.4s, v0.16b, v26.16b\n" + "ldr q27, [x28, #0x40]\n" + "ldr q26, [x28, #0x50]\n" + ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x6e9ba412 // ummla v18.4s, v0.16b, v27.16b\n" + ".inst 0x6e9aa416 // ummla v22.4s, v0.16b, v26.16b\n" + ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "bge 44b\n" "46:" // Height 
2: Multiply loop: Skip odd blocks @@ -621,22 +620,22 @@ void a64_hybrid_u8qa_mmla_4x16 ( "tbnz %x[flags], #31, 51f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" "51:" // Height 2: Multiply loop: unique 8: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" - ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" - "ldr q5, [x28, #0x20]\n" - "ldr q6, [x28, #0x30]\n" - ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" - ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" - ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" - ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" - ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" + "ldr q25, [x28, #0x0]\n" + "ldr q24, [x28, #0x10]\n" + ".inst 0x6e99a410 // ummla v16.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a414 // ummla v20.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x20]\n" + "ldr q24, [x28, #0x30]\n" + ".inst 0x6e99a411 // ummla v17.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a415 // ummla v21.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x40]\n" + "ldr q24, [x28, #0x50]\n" + ".inst 0x6e99a412 // ummla v18.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a416 // ummla v22.4s, v0.16b, v24.16b\n" + "ldr q25, [x28, #0x60]\n" + "ldr q24, [x28, #0x70]\n" + ".inst 0x6e99a413 // ummla v19.4s, v0.16b, v25.16b\n" + ".inst 0x6e98a417 // ummla v23.4s, v0.16b, v24.16b\n" "add x28, x28, #0x80\n" "52:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -644,127 +643,127 @@ void a64_hybrid_u8qa_mmla_4x16 ( "cmp x26, x20\n" "bne 36b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" + "uzp1 v24.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "uzp1 v21.2d, v18.2d, v22.2d\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" - "mov v23.16b, v4.16b\n" + "mov v23.16b, v24.16b\n" "tbnz %x[flags], #31, 53f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v2.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v24.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" - "neg v2.4s, v2.4s\n" + "neg v24.4s, v24.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" - "mul v11.4s, v11.4s, v2.4s\n" - "mul v12.4s, v12.4s, v2.4s\n" + "mul v11.4s, v11.4s, v24.4s\n" + "mul v12.4s, v12.4s, v24.4s\n" "53:" // Height 2: skip row sum fixup - "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q28, [x10, #0x0]\n" + "ldr q27, [x10, #0x10]\n" "add v23.4s, v23.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q26, [x10, #0x20]\n" + "ldr q25, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v24.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add v23.4s, v23.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" + "add x20, %x[qp], 
%[per_layer_right_shift]\n" + "add v23.4s, v23.4s, v28.4s\n" + "add v20.4s, v20.4s, v27.4s\n" "add x10, x10, #0x40\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" - "add v16.4s, v16.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v25.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v27.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v25.4s\n" + "sqrdmulh v23.4s, v23.4s, v24.4s\n" + "sqrdmulh v20.4s, v20.4s, v24.4s\n" + "sqrdmulh v21.4s, v21.4s, v24.4s\n" + "sqrdmulh v22.4s, v22.4s, v24.4s\n" + "sqrdmulh v16.4s, v16.4s, v24.4s\n" + "sqrdmulh v17.4s, v17.4s, v24.4s\n" + "sqrdmulh v18.4s, v18.4s, v24.4s\n" + "sqrdmulh v19.4s, v19.4s, v24.4s\n" "tbz %x[flags], #5, 54f\n" - "and v4.16b, v23.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v23.4s, v23.4s, v4.4s\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" + "and v24.16b, v23.16b, v0.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v24.4s\n" + "and v30.16b, v20.16b, v0.16b\n" + "and v29.16b, v21.16b, v0.16b\n" + "and v28.16b, v22.16b, v0.16b\n" + "and v27.16b, v16.16b, v0.16b\n" + "and v26.16b, v17.16b, v0.16b\n" + "and v25.16b, v18.16b, v0.16b\n" + "and v24.16b, v19.16b, v0.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "sqadd v22.4s, v22.4s, v28.4s\n" + "sqadd v16.4s, v16.4s, v27.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "sqadd v18.4s, v18.4s, v25.4s\n" + "sqadd v19.4s, v19.4s, v24.4s\n" "54:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v26.4s }, [x20]\n" "srshl v23.4s, v23.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v25.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v24.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v20.4s, v20.4s, 
v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" + "add v23.4s, v23.4s, v26.4s\n" + "add v20.4s, v20.4s, v26.4s\n" + "add v21.4s, v21.4s, v26.4s\n" + "add v22.4s, v22.4s, v26.4s\n" + "add v16.4s, v16.4s, v26.4s\n" + "add v17.4s, v17.4s, v26.4s\n" + "add v18.4s, v18.4s, v26.4s\n" + "add v19.4s, v19.4s, v26.4s\n" + "smin v23.4s, v23.4s, v25.4s\n" + "smin v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v25.4s\n" + "smin v22.4s, v22.4s, v25.4s\n" + "smin v16.4s, v16.4s, v25.4s\n" + "smin v17.4s, v17.4s, v25.4s\n" + "smin v18.4s, v18.4s, v25.4s\n" + "smin v19.4s, v19.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smax v22.4s, v22.4s, v24.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smax v18.4s, v18.4s, v24.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" "uzp1 v23.8h, v23.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" @@ -774,68 +773,68 @@ void a64_hybrid_u8qa_mmla_4x16 ( "bge 63f\n" "tbz x9, #3, 58f\n" "str d23, [x27], #0x8\n" - "str d16, [x22], #0x8\n" + "str d16, [x23], #0x8\n" "tbz x9, #2, 56f\n" "st1 { v23.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" "tbz x9, #1, 55f\n" "st1 { v23.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" + "st1 { v16.b }[14], [x23]\n" "b 62f\n" "55:" // Height 2: Partial direct writeback: partial_1_12 "tbz x9, #0, 62f\n" "st1 { v23.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" + "st1 { v16.b }[12], [x23]\n" "b 62f\n" "56:" // Height 2: Partial direct writeback: partial_2_8 "tbz x9, #1, 57f\n" "st1 { v23.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" + "st1 { v16.b }[10], [x23]\n" "b 62f\n" "57:" // Height 2: Partial direct writeback: partial_1_8 "tbz x9, #0, 62f\n" "st1 { v23.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" + "st1 { v16.b }[8], [x23]\n" "b 62f\n" "58:" // Height 2: Partial direct writeback: partial_4_0 "tbz x9, #2, 60f\n" "str s23, [x27], #0x4\n" - "str s16, [x22], #0x4\n" + "str s16, [x23], #0x4\n" "tbz x9, #1, 59f\n" "st1 { v23.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" + "st1 { v16.b }[6], [x23]\n" "b 62f\n" "59:" // Height 2: Partial direct writeback: partial_1_4 "tbz x9, #0, 62f\n" "st1 { v23.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" + "st1 { v16.b }[4], [x23]\n" "b 62f\n" "60:" // Height 2: Partial direct writeback: partial_2_0 "tbz x9, #1, 61f\n" "str h23, [x27], #0x2\n" - "str h16, [x22], #0x2\n" + "str h16, [x23], 
#0x2\n" "tbz x9, #0, 62f\n" "st1 { v23.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" + "st1 { v16.b }[2], [x23]\n" "b 62f\n" "61:" // Height 2: Partial direct writeback: partial_1_0 "str b23, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" + "str b16, [x23, #0x0]\n" "62:" // Height 2: Partial direct writeback: Done "b 64f\n" "63:" // Height 2: Full writeback "str q23, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" + "str q16, [x23, #0x0]\n" "64:" // Height 2: Writeback done "subs x9, x9, #0x10\n" "bgt 34b\n" @@ -872,13 +871,13 @@ void a64_hybrid_u8qa_mmla_4x16 ( "68:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 69f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 70f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -887,8 +886,8 @@ void a64_hybrid_u8qa_mmla_4x16 ( "b 70f\n" "69:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "70:" // Height 3: input setup done "cmp x25, #0x10\n" "blt 75f\n" @@ -909,12 +908,12 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q14, [x28, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q5, [x28, #0x60]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q4, [x28, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" "ldr q7, [x28, #0x90]\n" @@ -930,15 +929,15 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n" ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" "ldr q10, [x28, #0xc0]\n" - ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n" + ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n" "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n" + "ldr q4, [x28, #0xf0]\n" "add x28, x28, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" @@ -948,12 +947,12 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, 
v4.16b\n" - ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n" "tbnz %x[flags], #31, 72f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" @@ -981,12 +980,12 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - "ldr q5, [x28, #0x70]\n" + "ldr q14, [x28, #0x70]\n" ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - "ldr q4, [x28, #0x60]\n" + "ldr q5, [x28, #0x60]\n" ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n" - "ldr q6, [x28, #0x80]\n" + "ldr q4, [x28, #0x80]\n" ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" "ldr q7, [x28, #0x90]\n" @@ -1003,15 +1002,15 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n" "ldr q10, [x28, #0xc0]\n" "add x22, x22, #0x10\n" - ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n" - "ldr q4, [x28, #0xd0]\n" - ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n" + ".inst 0x6e85a413 // ummla v19.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45b // ummla v27.4s, v2.16b, v5.16b\n" + "ldr q6, [x28, #0xd0]\n" + ".inst 0x6e8ea417 // ummla v23.4s, v0.16b, v14.16b\n" + ".inst 0x6e8ea45f // ummla v31.4s, v2.16b, v14.16b\n" "ldr q5, [x28, #0xe0]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n" - "ldr q6, [x28, #0xf0]\n" + ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a478 // ummla v24.4s, v3.16b, v4.16b\n" + "ldr q4, [x28, #0xf0]\n" "add x28, x28, #0x100\n" ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n" ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n" @@ -1021,12 +1020,12 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n" ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n" ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n" - ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" + ".inst 0x6e86a436 // ummla v22.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" - ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n" + ".inst 0x6e84a437 // ummla v23.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n" "tbnz %x[flags], #31, 74f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" @@ -1042,41 +1041,41 @@ void a64_hybrid_u8qa_mmla_4x16 ( "blt 78f\n" "76:" // Height 3: Multiply loop: Odd block loop "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x22], #0x8\n" - "trn1 v2.2d, v3.2d, v7.2d\n" + "ldr d0, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v0.2d\n" 
+ "ldr d1, [x22], #0x8\n" + "trn1 v2.2d, v1.2d, v2.2d\n" "tbnz %x[flags], #31, 77f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "77:" // Height 3: Multiply loop: unique 11: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" + ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" "sub x25, x25, #0x8\n" "cmp x25, #0x8\n" "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" - ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" - ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" "add x28, x28, #0x80\n" ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" - ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" + ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" + ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" "bge 76b\n" "78:" // Height 3: Multiply loop: Skip odd blocks "cbz x25, 84f\n" @@ -1115,52 +1114,52 @@ void a64_hybrid_u8qa_mmla_4x16 ( "ldr b3, [x22, #0x0]\n" "82:" // Height 3: Multiply loop: Ragged operand read: Done "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v9.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" "tbnz %x[flags], #31, 83f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "83:" // Height 3: Multiply loop: unique 12: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" - "ldr q5, [x28, #0x20]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q3, [x28, #0x10]\n" + ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x28, #0x20]\n" "ldr q6, [x28, #0x30]\n" - ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" - 
"ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" + ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n" + "ldr q5, [x28, #0x40]\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" "add x28, x28, #0x80\n" - ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" - ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" - ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" - ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" + ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" + ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" "84:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 68b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "uzp1 v0.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" - "prfm pstl1keep, [x21, #0x0]\n" + "prfm pstl1keep, [x22, #0x0]\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" "uzp2 v19.2d, v19.2d, v23.2d\n" @@ -1168,116 +1167,116 @@ void a64_hybrid_u8qa_mmla_4x16 ( "uzp1 v25.2d, v25.2d, v29.2d\n" "uzp1 v26.2d, v26.2d, v30.2d\n" "uzp1 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v4.16b\n" + "mov v31.16b, v0.16b\n" "tbnz %x[flags], #31, 85f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v3.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v23.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v3.4s, v3.4s\n" + "neg v23.4s, v23.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v3.4s\n" - "mul v12.4s, v12.4s, v3.4s\n" - "mul v13.4s, v13.4s, v3.4s\n" + "mul v11.4s, v11.4s, v23.4s\n" + "mul v12.4s, v12.4s, v23.4s\n" + "mul v13.4s, v13.4s, v23.4s\n" "85:" // Height 3: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q30, [x10, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q29, [x10, #0x20]\n" + "ldr q28, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v23.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], 
#0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v24.4s, v24.4s, v13.4s\n" "add v25.4s, v25.4s, v13.4s\n" "add x10, x10, #0x40\n" "add v26.4s, v26.4s, v13.4s\n" "add v27.4s, v27.4s, v13.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" + "add v20.4s, v20.4s, v30.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v28.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v30.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v28.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - "sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "sqrdmulh v31.4s, v31.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v21.4s, v21.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sqrdmulh v16.4s, v16.4s, v23.4s\n" + "sqrdmulh v17.4s, v17.4s, v23.4s\n" + "sqrdmulh v18.4s, v18.4s, v23.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqrdmulh v24.4s, v24.4s, v23.4s\n" + "sqrdmulh v25.4s, v25.4s, v23.4s\n" + "sqrdmulh v26.4s, v26.4s, v23.4s\n" + "sqrdmulh v27.4s, v27.4s, v23.4s\n" "tbz %x[flags], #5, 86f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "and v5.16b, v24.16b, v0.16b\n" - "and v6.16b, v25.16b, v0.16b\n" - "and v7.16b, v26.16b, v0.16b\n" - "and v8.16b, v27.16b, v0.16b\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "sqadd v24.4s, v24.4s, v5.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "sqadd v26.4s, v26.4s, v7.4s\n" - "sqadd v27.4s, v27.4s, v8.4s\n" + "and v1.16b, v31.16b, v0.16b\n" + "and v30.16b, v20.16b, v0.16b\n" + "and v29.16b, v21.16b, v0.16b\n" + "and v28.16b, v22.16b, v0.16b\n" + "and v23.16b, v16.16b, v0.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v1.4s\n" + "sqadd v20.4s, v20.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + 
"sqadd v22.4s, v22.4s, v28.4s\n" + "sqadd v16.4s, v16.4s, v23.4s\n" + "and v3.16b, v17.16b, v0.16b\n" + "and v2.16b, v18.16b, v0.16b\n" + "and v1.16b, v19.16b, v0.16b\n" + "and v30.16b, v24.16b, v0.16b\n" + "and v29.16b, v25.16b, v0.16b\n" + "and v28.16b, v26.16b, v0.16b\n" + "and v23.16b, v27.16b, v0.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v18.4s, v18.4s, v2.4s\n" + "sqadd v19.4s, v19.4s, v1.4s\n" + "sqadd v24.4s, v24.4s, v30.4s\n" + "sqadd v25.4s, v25.4s, v29.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "sqadd v27.4s, v27.4s, v23.4s\n" "86:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v29.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v28.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v23.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1285,132 +1284,132 @@ void a64_hybrid_u8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v29.4s\n" + "add v20.4s, v20.4s, v29.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v16.4s, v16.4s, v29.4s\n" + "add v17.4s, v17.4s, v29.4s\n" + "add v18.4s, v18.4s, v29.4s\n" + "add v19.4s, v19.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v27.4s, v27.4s, v29.4s\n" + "smin v31.4s, v31.4s, v28.4s\n" + "smin v20.4s, v20.4s, v28.4s\n" + "smin v21.4s, v21.4s, v28.4s\n" + "smin v22.4s, v22.4s, v28.4s\n" + "smin v16.4s, v16.4s, v28.4s\n" + "smin v17.4s, v17.4s, v28.4s\n" + "smin v18.4s, v18.4s, v28.4s\n" + "smin v19.4s, v19.4s, v28.4s\n" + "smin v24.4s, v24.4s, v28.4s\n" + "smin v25.4s, v25.4s, v28.4s\n" + "smin v26.4s, v26.4s, 
v28.4s\n" + "smin v27.4s, v27.4s, v28.4s\n" + "smax v31.4s, v31.4s, v23.4s\n" + "smax v20.4s, v20.4s, v23.4s\n" + "smax v21.4s, v21.4s, v23.4s\n" + "smax v22.4s, v22.4s, v23.4s\n" + "smax v16.4s, v16.4s, v23.4s\n" + "smax v17.4s, v17.4s, v23.4s\n" + "smax v18.4s, v18.4s, v23.4s\n" + "smax v19.4s, v19.4s, v23.4s\n" + "smax v24.4s, v24.4s, v23.4s\n" + "smax v25.4s, v25.4s, v23.4s\n" + "smax v26.4s, v26.4s, v23.4s\n" + "smax v27.4s, v27.4s, v23.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v18.8h, v18.8h, v19.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 95f\n" "tbz x9, #3, 90f\n" "str d31, [x27], #0x8\n" - "str d16, [x22], #0x8\n" - "str d24, [x21], #0x8\n" + "str d16, [x23], #0x8\n" + "str d24, [x22], #0x8\n" "tbz x9, #2, 88f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v24.s }[2], [x21], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v24.s }[2], [x22], #0x4\n" "tbz x9, #1, 87f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v24.h }[6], [x21], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v24.h }[6], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v24.b }[14], [x21]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v24.b }[14], [x22]\n" "b 94f\n" "87:" // Height 3: Partial direct writeback: partial_1_12 "tbz x9, #0, 94f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v24.b }[12], [x21]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v24.b }[12], [x22]\n" "b 94f\n" "88:" // Height 3: Partial direct writeback: partial_2_8 "tbz x9, #1, 89f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v24.h }[4], [x21], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v24.h }[4], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v24.b }[10], [x21]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v24.b }[10], [x22]\n" "b 94f\n" "89:" // Height 3: Partial direct writeback: partial_1_8 "tbz x9, #0, 94f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v24.b }[8], [x21]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v24.b }[8], [x22]\n" "b 94f\n" "90:" // Height 3: Partial direct writeback: partial_4_0 "tbz x9, #2, 92f\n" "str s31, [x27], #0x4\n" - "str s16, [x22], #0x4\n" - "str s24, [x21], #0x4\n" + "str s16, [x23], #0x4\n" + "str s24, [x22], #0x4\n" "tbz x9, #1, 91f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v24.h }[2], [x21], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v24.h }[2], [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v24.b }[6], [x21]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v24.b }[6], [x22]\n" "b 94f\n" "91:" // Height 3: Partial direct writeback: partial_1_4 "tbz x9, #0, 94f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v24.b }[4], [x21]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v24.b }[4], [x22]\n" "b 94f\n" "92:" // Height 3: Partial direct writeback: partial_2_0 "tbz x9, #1, 93f\n" "str h31, [x27], #0x2\n" - "str h16, [x22], #0x2\n" - "str h24, [x21], #0x2\n" + "str h16, [x23], 
#0x2\n" + "str h24, [x22], #0x2\n" "tbz x9, #0, 94f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v24.b }[2], [x21]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v24.b }[2], [x22]\n" "b 94f\n" "93:" // Height 3: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b24, [x21, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b24, [x22, #0x0]\n" "94:" // Height 3: Partial direct writeback: Done "b 96f\n" "95:" // Height 3: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" - "str q24, [x21, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q24, [x22, #0x0]\n" "96:" // Height 3: Writeback done "subs x9, x9, #0x10\n" "bgt 66b\n" @@ -1451,14 +1450,14 @@ void a64_hybrid_u8qa_mmla_4x16 ( "100:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 101f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 102f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1468,9 +1467,9 @@ void a64_hybrid_u8qa_mmla_4x16 ( "b 102f\n" "101:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "102:" // Height 4: input setup done "cmp x25, #0x10\n" "blt 107f\n" @@ -1630,42 +1629,42 @@ void a64_hybrid_u8qa_mmla_4x16 ( "blt 110f\n" "108:" // Height 4: Multiply loop: Odd block loop "ldr d1, [x24], #0x8\n" - "ldr d2, [x23], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v2.2d, v3.2d, v7.2d\n" + "ldr d0, [x23], #0x8\n" + "trn1 v0.2d, v1.2d, v0.2d\n" + "ldr d2, [x22], #0x8\n" + "ldr d1, [x21], #0x8\n" + "trn1 v2.2d, v2.2d, v1.2d\n" "tbnz %x[flags], #31, 109f\n" ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "109:" // Height 4: Multiply loop: unique 15: skip row sum - "ldr q8, [x28, #0x0]\n" - "ldr q9, [x28, #0x10]\n" - ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n" - "ldr q10, [x28, #0x20]\n" - "ldr q4, [x28, #0x30]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q1, [x28, #0x10]\n" + ".inst 0x6e83a410 // ummla v16.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x28, #0x30]\n" "sub x25, x25, #0x8\n" "cmp x25, #0x8\n" "ldr q5, [x28, #0x40]\n" - "ldr q6, [x28, #0x50]\n" - ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n" - ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n" - "ldr q7, [x28, #0x60]\n" - "ldr q8, [x28, #0x70]\n" - ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n" - ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e81a414 // ummla v20.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, 
#0x60]\n" + "ldr q1, [x28, #0x70]\n" + ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" "add x28, x28, #0x80\n" ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" - ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n" - ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n" + ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" + ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" + ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" "bge 108b\n" "110:" // Height 4: Multiply loop: Skip odd blocks "cbz x25, 116f\n" @@ -1716,51 +1715,51 @@ void a64_hybrid_u8qa_mmla_4x16 ( ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" "115:" // Height 4: Multiply loop: unique 16: skip row sum - "ldr q10, [x28, #0x0]\n" - "ldr q4, [x28, #0x10]\n" - ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n" - "ldr q5, [x28, #0x20]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q3, [x28, #0x10]\n" + ".inst 0x6e81a410 // ummla v16.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x28, #0x20]\n" "ldr q6, [x28, #0x30]\n" - ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n" - "ldr q7, [x28, #0x40]\n" - "ldr q8, [x28, #0x50]\n" - ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n" - ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" - "ldr q9, [x28, #0x60]\n" - "ldr q10, [x28, #0x70]\n" + ".inst 0x6e83a414 // ummla v20.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45c // ummla v28.4s, v2.16b, v3.16b\n" + "ldr q5, [x28, #0x40]\n" + "ldr q4, [x28, #0x50]\n" + ".inst 0x6e81a411 // ummla v17.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n" + "ldr q3, [x28, #0x60]\n" + "ldr q1, [x28, #0x70]\n" ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n" ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n" + ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n" + ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n" "add x28, x28, #0x80\n" - ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n" - ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n" - ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n" - ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n" - ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n" - ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n" + ".inst 0x6e84a416 // ummla v22.4s, v0.16b, v4.16b\n" + ".inst 0x6e84a45e // ummla v30.4s, v2.16b, v4.16b\n" + ".inst 0x6e83a413 // ummla v19.4s, v0.16b, v3.16b\n" + ".inst 0x6e83a45b // ummla v27.4s, v2.16b, v3.16b\n" + ".inst 0x6e81a417 // ummla v23.4s, v0.16b, v1.16b\n" + ".inst 0x6e81a45f // ummla v31.4s, v2.16b, v1.16b\n" "116:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, 
#0x1\n" "cmp x26, x20\n" "bne 100b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v4.2d, v16.2d, v20.2d\n" - "add x22, x27, x20\n" + "uzp1 v0.2d, v16.2d, v20.2d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "add x21, x22, x20\n" - "add x20, x21, x20\n" "uzp2 v16.2d, v16.2d, v20.2d\n" "uzp1 v20.2d, v17.2d, v21.2d\n" "prfm pstl1keep, [x27, #0x0]\n" "uzp2 v17.2d, v17.2d, v21.2d\n" "uzp1 v21.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" "prfm pstl1keep, [x22, #0x0]\n" - "prfm pstl1keep, [x21, #0x0]\n" "uzp2 v18.2d, v18.2d, v22.2d\n" "uzp1 v22.2d, v19.2d, v23.2d\n" - "prfm pstl1keep, [x20, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" "uzp2 v19.2d, v19.2d, v23.2d\n" "uzp1 v23.2d, v24.2d, v28.2d\n" "uzp2 v24.2d, v24.2d, v28.2d\n" @@ -1770,38 +1769,38 @@ void a64_hybrid_u8qa_mmla_4x16 ( "uzp2 v26.2d, v26.2d, v30.2d\n" "uzp1 v30.2d, v27.2d, v31.2d\n" "uzp2 v27.2d, v27.2d, v31.2d\n" - "mov v31.16b, v4.16b\n" + "mov v31.16b, v0.16b\n" "tbnz %x[flags], #31, 117f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1r { v0.4s }, [x20]\n" "addp v11.4s, v11.4s, v11.4s\n" "addp v13.4s, v13.4s, v13.4s\n" - "neg v4.4s, v4.4s\n" + "neg v0.4s, v0.4s\n" "dup v12.4s, v11.s[3]\n" "dup v11.4s, v11.s[0]\n" "dup v14.4s, v13.s[3]\n" "dup v13.4s, v13.s[0]\n" - "mul v11.4s, v11.4s, v4.4s\n" - "mul v12.4s, v12.4s, v4.4s\n" - "mul v13.4s, v13.4s, v4.4s\n" - "mul v14.4s, v14.4s, v4.4s\n" + "mul v11.4s, v11.4s, v0.4s\n" + "mul v12.4s, v12.4s, v0.4s\n" + "mul v13.4s, v13.4s, v0.4s\n" + "mul v14.4s, v14.4s, v0.4s\n" "117:" // Height 4: skip row sum fixup "ldr q0, [x10, #0x0]\n" - "ldr q1, [x10, #0x10]\n" + "ldr q4, [x10, #0x10]\n" "add v31.4s, v31.4s, v11.4s\n" "add v20.4s, v20.4s, v11.4s\n" - "ldr q2, [x10, #0x20]\n" - "ldr q3, [x10, #0x30]\n" + "ldr q3, [x10, #0x20]\n" + "ldr q2, [x10, #0x30]\n" "add v21.4s, v21.4s, v11.4s\n" "add v22.4s, v22.4s, v11.4s\n" "add v16.4s, v16.4s, v12.4s\n" "add v17.4s, v17.4s, v12.4s\n" - "add x23, %x[qp], %[per_layer_mul]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1r { v1.4s }, [x20]\n" "add v18.4s, v18.4s, v12.4s\n" "add v19.4s, v19.4s, v12.4s\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add v23.4s, v23.4s, v13.4s\n" "add v28.4s, v28.4s, v13.4s\n" "add x10, x10, #0x40\n" @@ -1812,100 +1811,100 @@ void a64_hybrid_u8qa_mmla_4x16 ( "add v26.4s, v26.4s, v14.4s\n" "add v27.4s, v27.4s, v14.4s\n" "add v31.4s, v31.4s, v0.4s\n" - "add v20.4s, v20.4s, v1.4s\n" - "add v21.4s, v21.4s, v2.4s\n" - "add v22.4s, v22.4s, v3.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v2.4s\n" "add v16.4s, v16.4s, v0.4s\n" - "add v17.4s, v17.4s, v1.4s\n" - "add v18.4s, v18.4s, v2.4s\n" - "add v19.4s, v19.4s, v3.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v2.4s\n" "add v23.4s, v23.4s, v0.4s\n" - "add v28.4s, v28.4s, v1.4s\n" - "add v29.4s, v29.4s, v2.4s\n" - "add v30.4s, v30.4s, v3.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v2.4s\n" "add v24.4s, v24.4s, v0.4s\n" - "ld1r { v0.4s }, [x23]\n" - "add v25.4s, v25.4s, v1.4s\n" - "add v26.4s, v26.4s, v2.4s\n" - "add v27.4s, v27.4s, v3.4s\n" - "sqrdmulh v31.4s, v31.4s, v4.4s\n" - "sqrdmulh v20.4s, v20.4s, v4.4s\n" - "sqrdmulh v21.4s, v21.4s, v4.4s\n" - "sqrdmulh v22.4s, v22.4s, v4.4s\n" - "sqrdmulh v16.4s, v16.4s, v4.4s\n" - 
"sqrdmulh v17.4s, v17.4s, v4.4s\n" - "sqrdmulh v18.4s, v18.4s, v4.4s\n" - "sqrdmulh v19.4s, v19.4s, v4.4s\n" - "sqrdmulh v23.4s, v23.4s, v4.4s\n" - "sqrdmulh v28.4s, v28.4s, v4.4s\n" - "sqrdmulh v29.4s, v29.4s, v4.4s\n" - "sqrdmulh v30.4s, v30.4s, v4.4s\n" - "sqrdmulh v24.4s, v24.4s, v4.4s\n" - "sqrdmulh v25.4s, v25.4s, v4.4s\n" - "sqrdmulh v26.4s, v26.4s, v4.4s\n" - "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v4.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, v2.4s\n" + "sqrdmulh v31.4s, v31.4s, v1.4s\n" + "sqrdmulh v20.4s, v20.4s, v1.4s\n" + "sqrdmulh v21.4s, v21.4s, v1.4s\n" + "sqrdmulh v22.4s, v22.4s, v1.4s\n" + "sqrdmulh v16.4s, v16.4s, v1.4s\n" + "sqrdmulh v17.4s, v17.4s, v1.4s\n" + "sqrdmulh v18.4s, v18.4s, v1.4s\n" + "sqrdmulh v19.4s, v19.4s, v1.4s\n" + "sqrdmulh v23.4s, v23.4s, v1.4s\n" + "sqrdmulh v28.4s, v28.4s, v1.4s\n" + "sqrdmulh v29.4s, v29.4s, v1.4s\n" + "sqrdmulh v30.4s, v30.4s, v1.4s\n" + "sqrdmulh v24.4s, v24.4s, v1.4s\n" + "sqrdmulh v25.4s, v25.4s, v1.4s\n" + "sqrdmulh v26.4s, v26.4s, v1.4s\n" + "sqrdmulh v27.4s, v27.4s, v1.4s\n" "tbz %x[flags], #5, 118f\n" - "and v4.16b, v31.16b, v0.16b\n" - "and v5.16b, v20.16b, v0.16b\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v31.4s, v31.4s, v4.4s\n" - "sqadd v20.4s, v20.4s, v5.4s\n" - "and v6.16b, v21.16b, v0.16b\n" - "and v7.16b, v22.16b, v0.16b\n" - "and v8.16b, v16.16b, v0.16b\n" - "and v9.16b, v17.16b, v0.16b\n" - "and v10.16b, v18.16b, v0.16b\n" - "and v4.16b, v19.16b, v0.16b\n" - "and v5.16b, v23.16b, v0.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" + "and v2.16b, v31.16b, v0.16b\n" + "and v1.16b, v20.16b, v0.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v31.4s, v31.4s, v2.4s\n" + "sqadd v20.4s, v20.4s, v1.4s\n" + "and v7.16b, v21.16b, v0.16b\n" + "and v6.16b, v22.16b, v0.16b\n" + "and v5.16b, v16.16b, v0.16b\n" + "and v4.16b, v17.16b, v0.16b\n" + "and v3.16b, v18.16b, v0.16b\n" + "and v2.16b, v19.16b, v0.16b\n" + "and v1.16b, v23.16b, v0.16b\n" "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" - "sshr v4.4s, v4.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v21.4s, v21.4s, v6.4s\n" - "sqadd v22.4s, v22.4s, v7.4s\n" - "sqadd v16.4s, v16.4s, v8.4s\n" - "sqadd v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v4.4s\n" - "sqadd v23.4s, v23.4s, v5.4s\n" - "and v6.16b, v28.16b, v0.16b\n" - "and v7.16b, v29.16b, v0.16b\n" - "and v8.16b, v30.16b, v0.16b\n" - "and v9.16b, v24.16b, v0.16b\n" - "and v10.16b, v25.16b, v0.16b\n" - "and v4.16b, v26.16b, v0.16b\n" - "and v5.16b, v27.16b, v0.16b\n" "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "sshr v9.4s, v9.4s, #0x1f\n" - "sshr v10.4s, v10.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v7.4s\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "sqadd v16.4s, v16.4s, v5.4s\n" + "sqadd v17.4s, v17.4s, v4.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v19.4s, v19.4s, v2.4s\n" + "sqadd v23.4s, v23.4s, v1.4s\n" + "and v7.16b, v28.16b, v0.16b\n" + "and v6.16b, v29.16b, v0.16b\n" + "and v5.16b, v30.16b, v0.16b\n" + "and v4.16b, v24.16b, v0.16b\n" + "and v3.16b, v25.16b, v0.16b\n" + "and v2.16b, v26.16b, v0.16b\n" + "and v1.16b, v27.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, 
#0x1f\n" "sshr v5.4s, v5.4s, #0x1f\n" - "sqadd v28.4s, v28.4s, v6.4s\n" - "sqadd v29.4s, v29.4s, v7.4s\n" - "sqadd v30.4s, v30.4s, v8.4s\n" - "sqadd v24.4s, v24.4s, v9.4s\n" - "sqadd v25.4s, v25.4s, v10.4s\n" - "sqadd v26.4s, v26.4s, v4.4s\n" - "sqadd v27.4s, v27.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v7.4s\n" + "sqadd v29.4s, v29.4s, v6.4s\n" + "sqadd v30.4s, v30.4s, v5.4s\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "sqadd v25.4s, v25.4s, v3.4s\n" + "sqadd v26.4s, v26.4s, v2.4s\n" + "sqadd v27.4s, v27.4s, v1.4s\n" "118:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1r { v4.4s }, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1r { v3.4s }, [x20]\n" "srshl v31.4s, v31.4s, v0.4s\n" "srshl v20.4s, v20.4s, v0.4s\n" "srshl v21.4s, v21.4s, v0.4s\n" "srshl v22.4s, v22.4s, v0.4s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1r { v6.4s }, [x23]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1r { v2.4s }, [x20]\n" "srshl v16.4s, v16.4s, v0.4s\n" "srshl v17.4s, v17.4s, v0.4s\n" - "add x23, %x[qp], %[minval]\n" - "ld1r { v5.4s }, [x23]\n" + "add x20, %x[qp], %[minval]\n" + "ld1r { v1.4s }, [x20]\n" "srshl v18.4s, v18.4s, v0.4s\n" "srshl v19.4s, v19.4s, v0.4s\n" "cmp x9, #0x10\n" @@ -1917,163 +1916,163 @@ void a64_hybrid_u8qa_mmla_4x16 ( "srshl v25.4s, v25.4s, v0.4s\n" "srshl v26.4s, v26.4s, v0.4s\n" "srshl v27.4s, v27.4s, v0.4s\n" - "add v31.4s, v31.4s, v4.4s\n" - "add v20.4s, v20.4s, v4.4s\n" - "add v21.4s, v21.4s, v4.4s\n" - "add v22.4s, v22.4s, v4.4s\n" - "add v16.4s, v16.4s, v4.4s\n" - "add v17.4s, v17.4s, v4.4s\n" - "add v18.4s, v18.4s, v4.4s\n" - "add v19.4s, v19.4s, v4.4s\n" - "add v23.4s, v23.4s, v4.4s\n" - "add v28.4s, v28.4s, v4.4s\n" - "add v29.4s, v29.4s, v4.4s\n" - "add v30.4s, v30.4s, v4.4s\n" - "add v24.4s, v24.4s, v4.4s\n" - "add v25.4s, v25.4s, v4.4s\n" - "add v26.4s, v26.4s, v4.4s\n" - "add v27.4s, v27.4s, v4.4s\n" - "smin v31.4s, v31.4s, v6.4s\n" - "smin v20.4s, v20.4s, v6.4s\n" - "smin v21.4s, v21.4s, v6.4s\n" - "smin v22.4s, v22.4s, v6.4s\n" - "smin v16.4s, v16.4s, v6.4s\n" - "smin v17.4s, v17.4s, v6.4s\n" - "smin v18.4s, v18.4s, v6.4s\n" - "smin v19.4s, v19.4s, v6.4s\n" - "smin v23.4s, v23.4s, v6.4s\n" - "smin v28.4s, v28.4s, v6.4s\n" - "smin v29.4s, v29.4s, v6.4s\n" - "smin v30.4s, v30.4s, v6.4s\n" - "smin v24.4s, v24.4s, v6.4s\n" - "smin v25.4s, v25.4s, v6.4s\n" - "smin v26.4s, v26.4s, v6.4s\n" - "smin v27.4s, v27.4s, v6.4s\n" - "smax v31.4s, v31.4s, v5.4s\n" - "smax v20.4s, v20.4s, v5.4s\n" - "smax v21.4s, v21.4s, v5.4s\n" - "smax v22.4s, v22.4s, v5.4s\n" - "smax v16.4s, v16.4s, v5.4s\n" - "smax v17.4s, v17.4s, v5.4s\n" - "smax v18.4s, v18.4s, v5.4s\n" - "smax v19.4s, v19.4s, v5.4s\n" - "smax v23.4s, v23.4s, v5.4s\n" - "smax v28.4s, v28.4s, v5.4s\n" - "smax v29.4s, v29.4s, v5.4s\n" - "smax v30.4s, v30.4s, v5.4s\n" - "smax v24.4s, v24.4s, v5.4s\n" - "smax v25.4s, v25.4s, v5.4s\n" - "smax v26.4s, v26.4s, v5.4s\n" - "smax v27.4s, v27.4s, v5.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "add v20.4s, v20.4s, v3.4s\n" + "add v21.4s, v21.4s, v3.4s\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v16.4s, v16.4s, v3.4s\n" + "add v17.4s, v17.4s, v3.4s\n" + "add v18.4s, v18.4s, v3.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v28.4s, v28.4s, v3.4s\n" + "add v29.4s, v29.4s, v3.4s\n" + "add v30.4s, v30.4s, v3.4s\n" + "add v24.4s, v24.4s, v3.4s\n" + "add v25.4s, v25.4s, v3.4s\n" + "add v26.4s, v26.4s, v3.4s\n" + "add v27.4s, v27.4s, 
v3.4s\n" + "smin v31.4s, v31.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + "smin v29.4s, v29.4s, v2.4s\n" + "smin v30.4s, v30.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smax v31.4s, v31.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + "smax v29.4s, v29.4s, v1.4s\n" + "smax v30.4s, v30.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" "uzp1 v31.8h, v31.8h, v20.8h\n" "uzp1 v20.8h, v21.8h, v22.8h\n" "uzp1 v16.8h, v16.8h, v17.8h\n" - "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v19.8h, v18.8h, v19.8h\n" "uzp1 v23.8h, v23.8h, v28.8h\n" - "uzp1 v28.8h, v29.8h, v30.8h\n" + "uzp1 v18.8h, v29.8h, v30.8h\n" "uzp1 v24.8h, v24.8h, v25.8h\n" - "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v17.8h, v26.8h, v27.8h\n" "uzp1 v31.16b, v31.16b, v20.16b\n" - "uzp1 v16.16b, v16.16b, v17.16b\n" - "uzp1 v23.16b, v23.16b, v28.16b\n" - "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v16.16b, v16.16b, v19.16b\n" + "uzp1 v23.16b, v23.16b, v18.16b\n" + "uzp1 v24.16b, v24.16b, v17.16b\n" "bge 127f\n" "tbz x9, #3, 122f\n" "str d31, [x27], #0x8\n" - "str d16, [x22], #0x8\n" - "str d23, [x21], #0x8\n" - "str d24, [x20], #0x8\n" + "str d16, [x23], #0x8\n" + "str d23, [x22], #0x8\n" + "str d24, [x21], #0x8\n" "tbz x9, #2, 120f\n" "st1 { v31.s }[2], [x27], #0x4\n" - "st1 { v16.s }[2], [x22], #0x4\n" - "st1 { v23.s }[2], [x21], #0x4\n" - "st1 { v24.s }[2], [x20], #0x4\n" + "st1 { v16.s }[2], [x23], #0x4\n" + "st1 { v23.s }[2], [x22], #0x4\n" + "st1 { v24.s }[2], [x21], #0x4\n" "tbz x9, #1, 119f\n" "st1 { v31.h }[6], [x27], #0x2\n" - "st1 { v16.h }[6], [x22], #0x2\n" - "st1 { v23.h }[6], [x21], #0x2\n" - "st1 { v24.h }[6], [x20], #0x2\n" + "st1 { v16.h }[6], [x23], #0x2\n" + "st1 { v23.h }[6], [x22], #0x2\n" + "st1 { v24.h }[6], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[14], [x27]\n" - "st1 { v16.b }[14], [x22]\n" - "st1 { v23.b }[14], [x21]\n" - "st1 { v24.b }[14], [x20]\n" + "st1 { v16.b }[14], [x23]\n" + "st1 { v23.b }[14], [x22]\n" + "st1 { v24.b }[14], [x21]\n" "b 126f\n" "119:" // Height 4: Partial direct writeback: partial_1_12 "tbz x9, #0, 126f\n" "st1 { v31.b }[12], [x27]\n" - "st1 { v16.b }[12], [x22]\n" - "st1 { v23.b }[12], [x21]\n" - "st1 { v24.b }[12], [x20]\n" + "st1 { v16.b }[12], [x23]\n" + "st1 { v23.b }[12], [x22]\n" + "st1 { v24.b }[12], [x21]\n" "b 126f\n" "120:" // Height 4: Partial direct writeback: partial_2_8 "tbz x9, #1, 121f\n" "st1 { v31.h }[4], [x27], #0x2\n" - "st1 { v16.h }[4], [x22], #0x2\n" - "st1 { v23.h }[4], [x21], #0x2\n" - "st1 { v24.h }[4], [x20], #0x2\n" + "st1 { v16.h }[4], [x23], #0x2\n" + "st1 { v23.h }[4], [x22], #0x2\n" + "st1 { v24.h }[4], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[10], [x27]\n" - "st1 { v16.b }[10], [x22]\n" - "st1 { v23.b }[10], [x21]\n" - "st1 { v24.b }[10], [x20]\n" + "st1 { v16.b }[10], [x23]\n" + "st1 { v23.b }[10], [x22]\n" + "st1 { v24.b 
}[10], [x21]\n" "b 126f\n" "121:" // Height 4: Partial direct writeback: partial_1_8 "tbz x9, #0, 126f\n" "st1 { v31.b }[8], [x27]\n" - "st1 { v16.b }[8], [x22]\n" - "st1 { v23.b }[8], [x21]\n" - "st1 { v24.b }[8], [x20]\n" + "st1 { v16.b }[8], [x23]\n" + "st1 { v23.b }[8], [x22]\n" + "st1 { v24.b }[8], [x21]\n" "b 126f\n" "122:" // Height 4: Partial direct writeback: partial_4_0 "tbz x9, #2, 124f\n" "str s31, [x27], #0x4\n" - "str s16, [x22], #0x4\n" - "str s23, [x21], #0x4\n" - "str s24, [x20], #0x4\n" + "str s16, [x23], #0x4\n" + "str s23, [x22], #0x4\n" + "str s24, [x21], #0x4\n" "tbz x9, #1, 123f\n" "st1 { v31.h }[2], [x27], #0x2\n" - "st1 { v16.h }[2], [x22], #0x2\n" - "st1 { v23.h }[2], [x21], #0x2\n" - "st1 { v24.h }[2], [x20], #0x2\n" + "st1 { v16.h }[2], [x23], #0x2\n" + "st1 { v23.h }[2], [x22], #0x2\n" + "st1 { v24.h }[2], [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[6], [x27]\n" - "st1 { v16.b }[6], [x22]\n" - "st1 { v23.b }[6], [x21]\n" - "st1 { v24.b }[6], [x20]\n" + "st1 { v16.b }[6], [x23]\n" + "st1 { v23.b }[6], [x22]\n" + "st1 { v24.b }[6], [x21]\n" "b 126f\n" "123:" // Height 4: Partial direct writeback: partial_1_4 "tbz x9, #0, 126f\n" "st1 { v31.b }[4], [x27]\n" - "st1 { v16.b }[4], [x22]\n" - "st1 { v23.b }[4], [x21]\n" - "st1 { v24.b }[4], [x20]\n" + "st1 { v16.b }[4], [x23]\n" + "st1 { v23.b }[4], [x22]\n" + "st1 { v24.b }[4], [x21]\n" "b 126f\n" "124:" // Height 4: Partial direct writeback: partial_2_0 "tbz x9, #1, 125f\n" "str h31, [x27], #0x2\n" - "str h16, [x22], #0x2\n" - "str h23, [x21], #0x2\n" - "str h24, [x20], #0x2\n" + "str h16, [x23], #0x2\n" + "str h23, [x22], #0x2\n" + "str h24, [x21], #0x2\n" "tbz x9, #0, 126f\n" "st1 { v31.b }[2], [x27]\n" - "st1 { v16.b }[2], [x22]\n" - "st1 { v23.b }[2], [x21]\n" - "st1 { v24.b }[2], [x20]\n" + "st1 { v16.b }[2], [x23]\n" + "st1 { v23.b }[2], [x22]\n" + "st1 { v24.b }[2], [x21]\n" "b 126f\n" "125:" // Height 4: Partial direct writeback: partial_1_0 "str b31, [x27, #0x0]\n" - "str b16, [x22, #0x0]\n" - "str b23, [x21, #0x0]\n" - "str b24, [x20, #0x0]\n" + "str b16, [x23, #0x0]\n" + "str b23, [x22, #0x0]\n" + "str b24, [x21, #0x0]\n" "126:" // Height 4: Partial direct writeback: Done "b 128f\n" "127:" // Height 4: Full writeback "str q31, [x27, #0x0]\n" "add x27, x27, #0x10\n" - "str q16, [x22, #0x0]\n" - "str q23, [x21, #0x0]\n" - "str q24, [x20, #0x0]\n" + "str q16, [x23, #0x0]\n" + "str q23, [x22, #0x0]\n" + "str q24, [x21, #0x0]\n" "128:" // Height 4: Writeback done "subs x9, x9, #0x10\n" "bgt 98b\n" @@ -2089,7 +2088,6 @@ void a64_hybrid_u8qa_mmla_4x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "130:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" 
(offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp index ce96c1b28f..38bb7c646d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -121,5 +121,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp index 705f6525b6..7f0fad7fa7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp @@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 171f\n" @@ -165,11 +164,11 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 15f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" "cbnz x15, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" @@ -186,129 +185,129 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "blt 18f\n" "17:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x16, #0x20]\n" - "ldr x12, [x16, #0x28]\n" + "ldr d17, [x16, #0x20]\n" + "ldr x20, [x16, #0x28]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x38]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x78]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xb8]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xf8]\n" - "mov v7.d[1], x11\n" + "ldr d16, [x16, #0x30]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x38]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + 
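// --- editorial note (not part of the original patch) ------------------------
// The "ldr d / ldr x / mov v.d[1], x" triples above are the Cortex-A55
// load-splitting idiom used throughout these a55.cpp kernels: instead of one
// 128-bit "ldr q", the low 64 bits go through the vector pipe and the high
// 64 bits through the integer pipe, then the halves are merged, letting the
// dual-issue in-order A55 keep both pipes busy. A sketch of the two
// equivalent forms, using offsets that appear above:
//
//     ldr q17, [x16, #0x20]      // plain form: one 128-bit load
//
//     ldr d17, [x16, #0x20]      // A55 form: low half via the vector pipe
//     ldr x20, [x16, #0x28]      // high half via the integer pipe
//     mov v17.d[1], x20          // merge the halves into the q register
// ----------------------------------------------------------------------------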
"ldr d17, [x16, #0x40]\n" + "ldr x20, [x16, #0x48]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + "ldr d16, [x16, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x16, #0x60]\n" + "ldr x20, [x16, #0x68]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x16, #0x70]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x78]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + "ldr d17, [x16, #0x80]\n" + "ldr x20, [x16, #0x88]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + "ldr d16, [x16, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x16, #0xa0]\n" + "ldr x20, [x16, #0xa8]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x16, #0xb0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xb8]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + "ldr d17, [x16, #0xc0]\n" + "ldr x20, [x16, #0xc8]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + "ldr d16, [x16, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + "ldr d17, [x16, #0xe0]\n" + "ldr x20, [x16, #0xe8]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + "ldr d16, [x16, #0xf0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xf8]\n" + "mov v16.d[1], x20\n" "add x13, x13, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" "ldr d6, [x16, #0x0]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr x20, [x16, #0x8]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x18]\n" - "mov v0.d[1], x10\n" - "mov v7.d[1], x11\n" + "ldr x21, [x13, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x16, #0x18]\n" + "mov v0.d[1], x21\n" + "mov v7.d[1], x20\n" "prfm pldl1keep, [x13, #0x80]\n" "bge 17b\n" "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q17, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" + 
"ldr q16, [x16, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x40]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x50]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x16, #0x60]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x16, #0x70]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x16, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x16, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x16, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x16, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x16, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x16, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x16, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x16, #0xf0]\n" "add x13, x13, #0x10\n" "sub x14, x14, #0x10\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" "19:" // Height 1: Multiply loop: Main loop skip "cbz x14, 24f\n" "cmp x14, #0x4\n" "blt 21f\n" "20:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s18, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q16, [x16, #0x0]\n" + ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x6f92e209 // udot v9.4s, v16.16b, v18.4b[0]\n" + "ldr q17, [x16, #0x20]\n" "cmp x14, #0x4\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n" "add x16, x16, #0x40\n" "bge 20b\n" "21:" // Height 1: Multiply loop: Skip odd blocks @@ -321,14 +320,14 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x0]\n" + ".inst 0x6f80e208 // udot v8.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x20]\n" + ".inst 0x6f80e20a // udot v10.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -499,226 +498,226 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" 
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" "cbnz x15, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" + "add x12, x12, x20\n" "b 50f\n" "49:" // Height 2: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" + "add x12, x13, x21\n" "50:" // Height 2: input setup done "cmp x14, #0x10\n" "blt 53f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" + "ldr q1, [x12, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 52f\n" "51:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d17, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x58]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x98]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0xd8]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 
0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v6.d[1], x12\n" + "ldr d16, [x16, #0x30]\n" + "mov v17.d[1], x21\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" + "ldr d17, [x16, #0x40]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + "ldr x20, [x16, #0x48]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" + "ldr d16, [x16, #0x50]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x58]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x16, #0x60]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x16, #0x70]\n" + "mov v17.d[1], x21\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n" + "ldr d17, [x16, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + "ldr x20, [x16, #0x88]\n" + ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n" + "ldr d16, [x16, #0x90]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0x98]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x16, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x16, #0xb0]\n" + "mov v17.d[1], x21\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + "mov v16.d[1], x20\n" + ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n" + "ldr d17, [x16, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + "ldr x20, [x16, #0xc8]\n" + ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n" + "ldr d16, [x16, #0xd0]\n" + "mov v17.d[1], x20\n" + "ldr x20, [x16, #0xd8]\n" + "mov v16.d[1], x20\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n" + "ldr d17, [x16, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n" + "ldr d16, [x16, #0xf0]\n" + "mov v17.d[1], x21\n" "add x13, x13, #0x10\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" + "mov v16.d[1], x20\n" + "add x12, x12, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n" "ldr d6, [x16, #0x0]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "ldr x21, [x16, #0x8]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" + ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x10, [x13, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x28, [x9, #0x8]\n" - "mov v0.d[1], x10\n" - "ldr x11, [x16, #0x18]\n" - "mov v1.d[1], x28\n" + "ldr x20, [x13, #0x8]\n" + "mov v6.d[1], x21\n" + "ldr x21, [x12, #0x8]\n" + "mov v0.d[1], x20\n" + "ldr x20, 
[x16, #0x18]\n" + "mov v1.d[1], x21\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v7.d[1], x11\n" - "prfm pldl1keep, [x9, #0x80]\n" + "mov v7.d[1], x20\n" + "prfm pldl1keep, [x12, #0x80]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q17, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" "sub x14, x14, #0x10\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x16, #0x40]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x16, #0x50]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x16, #0x60]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x16, #0x70]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x16, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x16, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x16, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 
0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x16, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x16, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x16, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x16, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x16, #0xf0]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x14, 58f\n" "cmp x14, #0x4\n" "blt 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s19, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s18, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x0]\n" + ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x16, #0x20]\n" + ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n" + ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n" "bge 54b\n" "55:" // Height 2: Multiply loop: Skip odd blocks "cbz x14, 58f\n" "tbz x14, #1, 56f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" + "ldr h1, [x12], #0x2\n" "tbz x14, #0, 57f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" + "ld1 { v1.b }[2], [x12]\n" "b 57f\n" "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" + "ldr b1, [x12, #0x0]\n" "57:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q17, [x16, #0x0]\n" + ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n" + "ldr q16, [x16, #0x10]\n" + ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n" + 
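// --- editorial note (not part of the original patch) ------------------------
// The kernel bodies in this patch appear to be regenerated rather than
// hand-edited: the dataflow is unchanged and only register numbers are
// reassigned (v6/v7 -> v16/v17, x11/x12 -> x20/x21, and so on). That is why
// each hard-coded ".inst" word changes by only a few bits: in these A64 SIMD
// encodings Rd sits in bits [4:0] and Rn in bits [9:5]. Decoding the old and
// new versions of one instruction that appears just above (illustrative):
//
//   0x6f80e0c8  ->  Rn = 0b00110 (v6),  Rd = 0b01000 (v8)   // udot v8.4s, v6.16b,  v0.4b[0]
//   0x6f80e228  ->  Rn = 0b10001 (v17), Rd = 0b01000 (v8)   // udot v8.4s, v17.16b, v0.4b[0]
// ----------------------------------------------------------------------------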
"ldr q17, [x16, #0x20]\n" + ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x16, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" "58:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -936,281 +935,281 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 83f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" "cbnz x15, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" "b 84f\n" "83:" // Height 3: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" "84:" // Height 3: input setup done "cmp x14, #0x10\n" "blt 87f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 86f\n" "85:" // Height 3: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d21, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v21.d[1], x21\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, 
#0x88]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr d20, [x16, #0x30]\n" + "mov v20.d[1], x20\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + "ldr d21, [x16, #0x40]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" + "mov v21.d[1], x21\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" + "ldr d20, [x16, #0x50]\n" + "mov v20.d[1], x20\n" + ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x16, #0x60]\n" + ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x16, #0x70]\n" + "mov v20.d[1], x20\n" + ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x98]\n" + ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n" + "ldr d21, [x16, #0x80]\n" + ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n" + "mov v21.d[1], x21\n" + ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n" + "ldr d20, [x16, #0x90]\n" + "mov v20.d[1], x20\n" + ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, 
v0.4b[2]\n" + ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x16, #0xa0]\n" + ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x16, #0xb0]\n" + "mov v20.d[1], x20\n" + ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n" + "ldr d21, [x16, #0xc0]\n" + ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n" + "mov v21.d[1], x21\n" + ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n" + "ldr d20, [x16, #0xd0]\n" + "mov v20.d[1], x20\n" + ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n" + "ldr d21, [x16, #0xe0]\n" + ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n" + "mov v21.d[1], x21\n" + ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n" "add x13, x13, #0x10\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" - "add x9, x9, #0x10\n" - "add x27, x27, #0x10\n" + ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n" + "ldr d20, [x16, #0xf0]\n" + "mov v20.d[1], x20\n" + "add x12, x12, #0x10\n" + "add x11, x11, #0x10\n" "add x16, x16, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n" + "ldr x20, [x16, #0x8]\n" + ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n" + "ldr x23, [x13, #0x8]\n" + ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" + ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + "ldr x22, [x12, #0x8]\n" + ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" "sub x14, x14, #0x10\n" "ldr d7, [x16, #0x10]\n" "cmp x14, #0x20\n" - "ldr x26, [x27, #0x8]\n" - "mov v6.d[1], x12\n" - "ldr x11, [x16, #0x18]\n" - "mov v0.d[1], x10\n" + "ldr x21, [x11, #0x8]\n" + "mov v6.d[1], x20\n" + "ldr x20, [x16, #0x18]\n" + "mov v0.d[1], x23\n" "prfm pldl1keep, [x13, #0x80]\n" - "mov v1.d[1], x28\n" - "prfm pldl1keep, [x9, #0x80]\n" - "mov v2.d[1], x26\n" - "prfm pldl1keep, [x27, #0x80]\n" - "mov v7.d[1], x11\n" + "mov v1.d[1], x22\n" + "prfm pldl1keep, [x12, #0x80]\n" + "mov v2.d[1], x21\n" + "prfm pldl1keep, [x11, #0x80]\n" + "mov v7.d[1], x20\n" "bge 85b\n" "86:" // Height 3: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, 
v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q21, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x16, #0x40]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x16, #0x50]\n" + ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x16, #0x60]\n" + ".inst 0x6fa0e289 // udot v9.4s, 
v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x16, #0x70]\n" + ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x16, #0x80]\n" + ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x16, #0x90]\n" + ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x16, #0xa0]\n" + ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x16, #0xb0]\n" + ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x16, #0xc0]\n" + ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x16, #0xd0]\n" + ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x16, #0xe0]\n" + ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x16, #0xf0]\n" + ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n" "87:" // Height 3: Multiply loop: Main loop skip "cbz x14, 92f\n" "cmp x14, #0x4\n" "blt 89f\n" "88:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s24, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s23, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s22, [x11], #0x4\n" + "ldr q21, [x16, #0x0]\n" + ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n" + "ldr q20, [x16, #0x10]\n" + ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n" + ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x16, #0x20]\n" + ".inst 0x6f98e289 // udot v9.4s, v20.16b, v24.4b[0]\n" + 
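// --- editorial sketch (not part of the original patch) ----------------------
// As with the other hybrid kernels in this patch, the "Height N:" sections of
// this file are unrolled specialisations of one kernel, one per possible
// count of live output rows; the row loop at the top of the file compares M
// against the maximum height (6 here, per "cmp %x[M], #0x6") and branches to
// the matching copy for the remainder. Schematically, with run_height_block
// as a hypothetical stand-in for the labelled asm blocks:
static void run_height_block(int height);  // hypothetical, for illustration

void row_loop_model(int M) {
    while (M > 0) {
        int height = M >= 6 ? 6 : M;   // a64_hybrid_u8u32_dot_6x16 tops out at 6
        run_height_block(height);      // e.g. the "Height 3:" block above
        M -= height;
    }
}
// ----------------------------------------------------------------------------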
".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n" "bge 88b\n" "89:" // Height 3: Multiply loop: Skip odd blocks "cbz x14, 92f\n" "tbz x14, #1, 90f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" "tbz x14, #0, 91f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" "b 91f\n" "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" "91:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q21, [x16, #0x0]\n" + ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n" + "ldr q20, [x16, #0x10]\n" + ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x16, #0x20]\n" + ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x16, #0x30]\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" "92:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -1475,336 +1474,336 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 117f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, 
x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" "cbnz x15, 118f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "b 118f\n" "117:" // Height 4: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" "118:" // Height 4: input setup done "cmp x14, #0x10\n" "blt 121f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 120f\n" "119:" // Height 4: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x16, #0x20]\n" + "ldr d25, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v25.d[1], x21\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, 
v6.16b, v3.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr d24, [x16, #0x30]\n" + "mov v24.d[1], x20\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + "add x11, x11, #0x10\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + "ldr d25, [x16, #0x40]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + "mov v25.d[1], x21\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" + "ldr d24, [x16, #0x50]\n" + "mov v24.d[1], x20\n" + ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n" + "ldr x25, [x13, #0x8]\n" + ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x16, #0x60]\n" + ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n" + "ldr x24, [x12, #0x8]\n" + ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x16, #0x70]\n" + "mov v24.d[1], x20\n" + ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x98]\n" + ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n" + "ldr x23, [x11, #0x8]\n" + ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n" + "ldr d25, [x16, #0x80]\n" + ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n" + "mov v25.d[1], x21\n" + ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n" + "ldr x22, [x10, #0x8]\n" + ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n" + "ldr d24, [x16, #0x90]\n" + "mov v24.d[1], x20\n" + ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n" "sub x14, x14, #0x10\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x16, #0xa0]\n" + ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n" "cmp x14, #0x20\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, 
v3.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x16, #0xb0]\n" + "mov v24.d[1], x20\n" + ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n" "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n" + "ldr d25, [x16, #0xc0]\n" + ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n" + "mov v25.d[1], x21\n" + ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n" + "ldr d24, [x16, #0xd0]\n" + "mov v24.d[1], x20\n" + ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n" + "ldr d25, [x16, #0xe0]\n" + ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n" + "mov v25.d[1], x21\n" + ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n" + "ldr d24, [x16, #0xf0]\n" + "mov v24.d[1], x20\n" "add x16, x16, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n" + "ldr x21, [x16, #0x8]\n" + ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n" + "ldr x20, [x16, #0x18]\n" + ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 
0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" + ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n" + "ldr d3, [x10, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" - "mov v7.d[1], x11\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x25\n" + "mov v1.d[1], x24\n" + "mov v2.d[1], x23\n" + "mov v3.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 119b\n" "120:" // Height 4: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q25, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, 
v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x16, #0x40]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x16, #0x50]\n" + ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x16, #0x60]\n" + ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x16, #0x70]\n" + ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x16, #0x80]\n" + ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x16, #0x90]\n" + ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x16, #0xa0]\n" + ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x16, #0xb0]\n" + ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb32 // udot v18.4s, v25.16b, 
v2.4b[2]\n" + ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x16, #0xc0]\n" + ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x16, #0xd0]\n" + ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x16, #0xe0]\n" + ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x16, #0xf0]\n" + ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n" "121:" // Height 4: Multiply loop: Main loop skip "cbz x14, 126f\n" "cmp x14, #0x4\n" "blt 123f\n" "122:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s29, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s28, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s27, [x11], #0x4\n" + "ldr s26, [x10], #0x4\n" + "ldr q25, [x16, #0x0]\n" + ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n" + "ldr q24, [x16, #0x10]\n" + ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x16, #0x20]\n" + ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n" + ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, 
v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n" "bge 122b\n" "123:" // Height 4: Multiply loop: Skip odd blocks "cbz x14, 126f\n" "tbz x14, #1, 124f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" "tbz x14, #0, 125f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" "b 125f\n" "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" "125:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q25, [x16, #0x0]\n" + ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n" + "ldr q24, [x16, #0x10]\n" + ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x16, #0x20]\n" + ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x16, #0x30]\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" "126:" // Height 4: Multiply loop: 
No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -2108,399 +2107,399 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "149:" // Height 5: setup done - "mov x15, #0x0\n" - "150:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 151f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "cbnz x15, 152f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x13, x13, x20\n" - "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "b 152f\n" - "151:" // Height 5: setup direct input - "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "152:" // Height 5: input setup done - "cmp x14, #0x10\n" - "blt 155f\n" - "ldr q0, [x13, #0x0]\n" - "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q6, [x16, #0x0]\n" - "ldr q7, [x16, #0x10]\n" - "blt 154f\n" - "153:" // Height 5: Multiply loop: Main loop head - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x13, x13, #0x10\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x10, [x13, #0x8]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr d6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x28, [x9, #0x8]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr x26, [x27, #0x8]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr x24, [x25, #0x8]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr x22, [x23, #0x8]\n" - ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x16, #0x60]\n" - 
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "sub x14, x14, #0x10\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "cmp x14, #0x20\n" - ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "prfm pldl1keep, [x13, #0x80]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" - ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" - "ldr d6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr d6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr d6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x15, 
#0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w14, [x20, x15, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "cbnz x15, 152f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x13, x13, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" + "add x9, x9, x20\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x13, %x[input_ptr]\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "152:" // Height 5: input setup done + "cmp x14, #0x10\n" + "blt 155f\n" + "ldr q0, [x13, #0x0]\n" + "cmp x14, #0x20\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q6, [x16, #0x0]\n" + "ldr q7, [x16, #0x10]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr x21, [x16, #0x28]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x38]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x13, x13, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "add x12, x12, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr d29, [x16, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x48]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "add x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr d28, [x16, #0x30]\n" + "mov v28.d[1], x20\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + "add x9, x9, #0x10\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n" + "ldr x26, [x13, #0x8]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n" + "ldr d29, [x16, #0x40]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + "mov v29.d[1], x21\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + "ldr x21, [x16, #0x68]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + "ldr x25, [x12, #0x8]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + "ldr x24, [x11, #0x8]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n" + "ldr d28, [x16, #0x50]\n" + "mov v28.d[1], x20\n" + ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x78]\n" + ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n" + "ldr x23, [x10, #0x8]\n" + ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n" + "ldr x22, [x9, #0x8]\n" + ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x16, #0x60]\n" + ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x16, #0x88]\n" + ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n" + "sub x14, x14, #0x10\n" + ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n" + "cmp x14, 
#0x20\n" + ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x16, #0x70]\n" + "mov v28.d[1], x20\n" + ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n" + "ldr x20, [x16, #0x98]\n" + ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n" + "prfm pldl1keep, [x13, #0x80]\n" + ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n" + "ldr d29, [x16, #0x80]\n" + ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n" + "mov v29.d[1], x21\n" + ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n" + "ldr x21, [x16, #0xa8]\n" + ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n" + "ldr d28, [x16, #0x90]\n" + "mov v28.d[1], x20\n" + ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x16, #0xa0]\n" + ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xc8]\n" + ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x16, #0xb0]\n" + "mov v28.d[1], x20\n" + ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n" + "ldr x20, [x16, #0xd8]\n" + ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n" + "ldr d29, [x16, #0xc0]\n" + ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n" + "mov v29.d[1], x21\n" + ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n" + "ldr x21, [x16, #0xe8]\n" + ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n" + "ldr d28, [x16, #0xd0]\n" + "mov v28.d[1], x20\n" + ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x16, #0xf8]\n" + ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n" + "ldr d29, [x16, #0xe0]\n" + ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n" + "mov v29.d[1], x21\n" + ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n" + "ldr d28, [x16, #0xf0]\n" + "mov v28.d[1], x20\n" "add x16, x16, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 
0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n" + "ldr x21, [x16, #0x8]\n" + ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n" + "ldr x20, [x16, #0x18]\n" + ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n" "ldr d6, [x16, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" - ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" + ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n" + "ldr d1, [x12, #0x0]\n" + ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n" + "ldr d2, [x11, #0x0]\n" + ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n" + "ldr d3, [x10, #0x0]\n" + ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n" + "ldr d4, [x9, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" - "mov v3.d[1], x24\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x26\n" + "mov v1.d[1], x25\n" + "mov v2.d[1], x24\n" + "mov v3.d[1], x23\n" "mov v4.d[1], x22\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "bge 153b\n" "154:" // Height 5: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" + "ldr q29, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x16, #0x60]\n" - ".inst 
0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x16, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x16, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x16, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x16, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x16, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x16, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x16, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x16, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x16, #0xf0]\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" + "prfm pldl1keep, [x11, #0x80]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + "prfm pldl1keep, [x9, #0x80]\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x16, #0x40]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x16, 
#0x50]\n" + ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x16, #0x60]\n" + ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x16, #0x70]\n" + ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x16, #0x80]\n" + ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x16, #0x90]\n" + ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x16, #0xa0]\n" + ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x16, #0xb0]\n" + ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x16, #0xc0]\n" + ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x16, #0xd0]\n" + ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x16, #0xe0]\n" + ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x16, #0xf0]\n" + ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n" "add x16, x16, #0x100\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - 
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n" "155:" // Height 5: Multiply loop: Main loop skip "cbz x14, 160f\n" "cmp x14, #0x4\n" "blt 157f\n" "156:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s2, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s1, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s0, [x11], #0x4\n" + "ldr s31, [x10], #0x4\n" + "ldr s30, [x9], #0x4\n" + "ldr q29, [x16, #0x0]\n" + ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n" + "ldr q28, [x16, #0x10]\n" + ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x16, #0x20]\n" + ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f9fe397 // udot v23.4s, 
v28.16b, v31.4b[0]\n" + ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n" "bge 156b\n" "157:" // Height 5: Multiply loop: Skip odd blocks "cbz x14, 160f\n" "tbz x14, #1, 158f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" + "ldr h4, [x9], #0x2\n" "tbz x14, #0, 159f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" + "ld1 { v4.b }[2], [x9]\n" "b 159f\n" "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" + "ldr b4, [x9, #0x0]\n" "159:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q29, [x16, #0x0]\n" + ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n" + "ldr q28, [x16, #0x10]\n" + ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x16, #0x20]\n" + ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x16, #0x30]\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, 
v4.4b[0]\n" "160:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -2862,98 +2861,98 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w14, [x20, x15, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 185f\n" - "ldr x21, [%x[input_ptr], x15, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x13, [x21, #0x0]\n" - "ldr x9, [x21, #0x8]\n" - "ldr x27, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x23, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x15, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x13, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x11, [x20, #0x10]\n" + "ldr x10, [x20, #0x18]\n" + "ldr x9, [x20, #0x20]\n" + "ldr x28, [x20, #0x28]\n" "cbnz x15, 186f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x13, x13, x20\n" + "add x12, x12, x20\n" + "add x11, x11, x20\n" + "add x10, x10, x20\n" "add x9, x9, x20\n" - "add x27, x27, x20\n" - "add x25, x25, x20\n" - "add x23, x23, x20\n" - "add x21, x21, x20\n" + "add x28, x28, x20\n" "b 186f\n" "185:" // Height 6: setup direct input "mov x13, %x[input_ptr]\n" - "add x9, x13, x20\n" - "add x27, x9, x20\n" - "add x25, x27, x20\n" - "add x23, x25, x20\n" - "add x21, x23, x20\n" + "add x12, x13, x21\n" + "add x11, x12, x21\n" + "add x10, x11, x21\n" + "add x9, x10, x21\n" + "add x28, x9, x21\n" "186:" // Height 6: input setup done "cmp x14, #0x10\n" "blt 189f\n" "ldr q0, [x13, #0x0]\n" "cmp x14, #0x20\n" - "ldr q1, [x9, #0x0]\n" - "ldr q2, [x27, #0x0]\n" - "ldr q3, [x25, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x21, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x11, #0x0]\n" + "ldr q3, [x10, #0x0]\n" + "ldr q4, [x9, #0x0]\n" + "ldr q5, [x28, #0x0]\n" "ldr q6, [x16, #0x0]\n" "ldr q7, [x16, #0x10]\n" "blt 188f\n" "187:" // Height 6: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr x12, [x16, #0x28]\n" + "ldr x21, [x16, #0x28]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x38]\n" + "ldr x20, [x16, #0x38]\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x48]\n" + "ldr x21, [x16, #0x48]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x16, #0x30]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr x11, [x16, #0x58]\n" + "ldr x20, [x16, #0x58]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr x10, [x13, #0x8]\n" + "ldr x27, 
[x13, #0x8]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr x28, [x9, #0x8]\n" + "ldr x26, [x12, #0x8]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr x26, [x27, #0x8]\n" + "ldr x25, [x11, #0x8]\n" ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" "ldr d6, [x16, #0x40]\n" ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr x12, [x16, #0x68]\n" + "ldr x21, [x16, #0x68]\n" ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr x24, [x25, #0x8]\n" + "ldr x24, [x10, #0x8]\n" ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr x22, [x23, #0x8]\n" + "ldr x23, [x9, #0x8]\n" ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr x20, [x21, #0x8]\n" + "ldr x22, [x28, #0x8]\n" ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" "ldr d7, [x16, #0x50]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x78]\n" + "ldr x20, [x16, #0x78]\n" ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" "sub x14, x14, #0x10\n" ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" @@ -2963,96 +2962,96 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x16, #0x60]\n" ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0x88]\n" + "ldr x21, [x16, #0x88]\n" ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x16, #0x70]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr x11, [x16, #0x98]\n" + "ldr x20, [x16, #0x98]\n" ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" "ldr d6, [x16, #0x80]\n" ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr x12, [x16, #0xa8]\n" + "ldr x21, [x16, #0xa8]\n" ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" "ldr d7, [x16, #0x90]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xb8]\n" + "ldr x20, [x16, #0xb8]\n" ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x16, #0xa0]\n" 
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xc8]\n" + "ldr x21, [x16, #0xc8]\n" ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x16, #0xb0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr x11, [x16, #0xd8]\n" + "ldr x20, [x16, #0xd8]\n" ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" "ldr d6, [x16, #0xc0]\n" ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr x12, [x16, #0xe8]\n" + "ldr x21, [x16, #0xe8]\n" ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" "ldr d7, [x16, #0xd0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0xf8]\n" + "ldr x20, [x16, #0xf8]\n" ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" "ldr d6, [x16, #0xe0]\n" ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "mov v6.d[1], x12\n" + "mov v6.d[1], x21\n" ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" "ldr d7, [x16, #0xf0]\n" - "mov v7.d[1], x11\n" + "mov v7.d[1], x20\n" "add x16, x16, #0x100\n" ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - "ldr x12, [x16, #0x8]\n" + "ldr x21, [x16, #0x8]\n" ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - "ldr x11, [x16, #0x18]\n" + "ldr x20, [x16, #0x18]\n" ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" @@ -3061,56 +3060,56 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" "ldr d0, [x13, #0x0]\n" ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - "ldr d1, [x9, #0x0]\n" + "ldr d1, [x12, #0x0]\n" ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - "ldr d2, [x27, #0x0]\n" + "ldr d2, [x11, #0x0]\n" ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - "ldr d3, [x25, #0x0]\n" + "ldr d3, [x10, #0x0]\n" ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" - "ldr d4, [x23, #0x0]\n" + "ldr d4, [x9, #0x0]\n" ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" - "ldr d5, [x21, #0x0]\n" + "ldr d5, [x28, #0x0]\n" "ldr d7, [x16, #0x10]\n" - "mov v6.d[1], x12\n" - "mov v0.d[1], x10\n" - "mov v1.d[1], x28\n" - "mov v2.d[1], x26\n" + "mov v6.d[1], x21\n" + "mov v0.d[1], x27\n" + "mov v1.d[1], x26\n" + "mov v2.d[1], x25\n" "mov v3.d[1], x24\n" - "mov 
v4.d[1], x22\n" - "mov v5.d[1], x20\n" - "mov v7.d[1], x11\n" + "mov v4.d[1], x23\n" + "mov v5.d[1], x22\n" + "mov v7.d[1], x20\n" "bge 187b\n" "188:" // Height 6: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" "add x13, x13, #0x10\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "add x9, x9, #0x10\n" + "add x12, x12, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "add x27, x27, #0x10\n" + "add x11, x11, #0x10\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" + "add x10, x10, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "add x23, x23, #0x10\n" + "add x9, x9, #0x10\n" ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" "ldr q6, [x16, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x21, x21, #0x10\n" + "add x28, x28, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" "sub x14, x14, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" "prfm pldl1keep, [x13, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "prfm pldl1keep, [x9, #0x80]\n" + "prfm pldl1keep, [x12, #0x80]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "prfm pldl1keep, [x27, #0x80]\n" + "prfm pldl1keep, [x11, #0x80]\n" ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" "ldr q7, [x16, #0x30]\n" ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x9, #0x80]\n" ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "prfm pldl1keep, [x21, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" @@ -3210,98 +3209,98 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "cmp x14, #0x4\n" "blt 191f\n" "190:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x13], #0x4\n" + "ldr s7, [x13], #0x4\n" "sub x14, x14, #0x4\n" - "ldr s1, [x9], #0x4\n" + "ldr s6, [x12], #0x4\n" "cmp x14, #0x4\n" - "ldr s2, [x27], #0x4\n" - "ldr s3, [x25], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr s5, [x11], #0x4\n" + "ldr s4, [x10], #0x4\n" + "ldr s3, [x9], #0x4\n" + "ldr s2, [x28], #0x4\n" + "ldr q1, [x16, #0x0]\n" + ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n" + "ldr q0, [x16, #0x10]\n" + ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x6f83e038 // udot v24.4s, v1.16b, 
v3.4b[0]\n" + ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x16, #0x20]\n" + ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x16, #0x30]\n" + ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n" "bge 190b\n" "191:" // Height 6: Multiply loop: Skip odd blocks "cbz x14, 194f\n" "tbz x14, #1, 192f\n" "ldr h0, [x13], #0x2\n" - "ldr h1, [x9], #0x2\n" - "ldr h2, [x27], #0x2\n" - "ldr h3, [x25], #0x2\n" - "ldr h4, [x23], #0x2\n" - "ldr h5, [x21], #0x2\n" + "ldr h1, [x12], #0x2\n" + "ldr h2, [x11], #0x2\n" + "ldr h3, [x10], #0x2\n" + "ldr h4, [x9], #0x2\n" + "ldr h5, [x28], #0x2\n" "tbz x14, #0, 193f\n" "ld1 { v0.b }[2], [x13]\n" - "ld1 { v1.b }[2], [x9]\n" - "ld1 { v2.b }[2], [x27]\n" - "ld1 { v3.b }[2], [x25]\n" - "ld1 { v4.b }[2], [x23]\n" - "ld1 { v5.b }[2], [x21]\n" + "ld1 { v1.b }[2], [x12]\n" + "ld1 { v2.b }[2], [x11]\n" + "ld1 { v3.b }[2], [x10]\n" + "ld1 { v4.b }[2], [x9]\n" + "ld1 { v5.b }[2], [x28]\n" "b 193f\n" "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x13, #0x0]\n" - "ldr b1, [x9, #0x0]\n" - "ldr b2, [x27, #0x0]\n" - "ldr b3, [x25, #0x0]\n" - "ldr b4, [x23, #0x0]\n" - "ldr b5, [x21, #0x0]\n" + "ldr b1, [x12, #0x0]\n" + "ldr b2, [x11, #0x0]\n" + "ldr b3, [x10, #0x0]\n" + "ldr b4, [x9, #0x0]\n" + "ldr b5, [x28, #0x0]\n" "193:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x16, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q7, [x16, #0x10]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x16, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - 
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x16, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x16, #0x0]\n" + ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" + "ldr q6, [x16, #0x10]\n" + ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x16, #0x20]\n" + ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x16, #0x30]\n" + ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n" "add x16, x16, #0x40\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n" "194:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x15, x15, #0x1\n" @@ -3488,7 +3487,6 @@ void a64_hybrid_u8u32_dot_6x16_a55 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "206:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp index 38131cfd4b..849c680843 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp @@ -77,7 +77,6 @@ void a64_hybrid_u8u32_dot_6x16 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 171f\n" @@ -165,11 +164,11 @@ void a64_hybrid_u8u32_dot_6x16 ( "14:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 15f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -186,37 +185,37 @@ void a64_hybrid_u8u32_dot_6x16 ( "blt 18f\n" "17:" // Height 1: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x6fa0e8ca // udot 
v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" "cmp x27, #0x20\n" "add x10, x10, #0x100\n" @@ -226,37 +225,37 @@ void a64_hybrid_u8u32_dot_6x16 ( "bge 17b\n" "18:" // Height 1: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x40]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x50]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x60]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "19:" // Height 1: Multiply loop: Main loop skip @@ -264,17 +263,17 @@ void a64_hybrid_u8u32_dot_6x16 ( "cmp x27, #0x4\n" "blt 21f\n" "20:" // Height 1: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr s18, [x26], #0x4\n" + "ldr q16, [x10, #0x0]\n" + ".inst 0x6f92e208 // udot v8.4s, v16.16b, v18.4b[0]\n" "sub x27, x27, #0x4\n" - "ldr q7, [x10, #0x10]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x10]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6f92e209 // 
udot v9.4s, v16.16b, v18.4b[0]\n" "cmp x27, #0x4\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f92e22a // udot v10.4s, v17.16b, v18.4b[0]\n" + ".inst 0x6f92e20b // udot v11.4s, v16.16b, v18.4b[0]\n" "add x10, x10, #0x40\n" "bge 20b\n" "21:" // Height 1: Multiply loop: Skip odd blocks @@ -287,14 +286,14 @@ void a64_hybrid_u8u32_dot_6x16 ( "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b0, [x26, #0x0]\n" "23:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" "add x10, x10, #0x40\n" "24:" // Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -465,12 +464,12 @@ void a64_hybrid_u8u32_dot_6x16 ( "48:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 49f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 50f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -478,7 +477,7 @@ void a64_hybrid_u8u32_dot_6x16 ( "b 50f\n" "49:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "50:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 53f\n" @@ -491,137 +490,137 @@ void a64_hybrid_u8u32_dot_6x16 ( "51:" // Height 2: Multiply loop: Main loop head ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "sub x27, x27, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x26, x26, #0x10\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x40]\n" "add x25, x25, #0x10\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x50]\n" "cmp x27, #0x20\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, 
#0x60]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x70]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, v0.4b[3]\n" + ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 51b\n" "52:" // Height 2: Multiply loop: Single iteration only ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q17, [x10, #0x20]\n" "add x26, x26, 
#0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q16, [x10, #0x30]\n" "add x25, x25, #0x10\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x40]\n" "sub x27, x27, #0x10\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x50]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x60]\n" + ".inst 0x6fa0e228 // udot v8.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e22c // udot v12.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6fa0e209 // udot v9.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20d // udot v13.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x70]\n" + ".inst 0x6fa0e22a // udot v10.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e22e // udot v14.4s, v17.16b, v1.4b[1]\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6fa0e20b // udot v11.4s, v16.16b, v0.4b[1]\n" + ".inst 0x6fa1e20f // udot v15.4s, v16.16b, v1.4b[1]\n" + "ldr q16, [x10, #0x90]\n" + ".inst 0x6f80ea28 // udot v8.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2c // udot v12.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xa0]\n" + ".inst 0x6f80ea09 // udot v9.4s, v16.16b, v0.4b[2]\n" + ".inst 0x6f81ea0d // udot v13.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xb0]\n" + ".inst 0x6f80ea2a // udot v10.4s, v17.16b, v0.4b[2]\n" + ".inst 0x6f81ea2e // udot v14.4s, v17.16b, v1.4b[2]\n" + "ldr q17, [x10, #0xc0]\n" + ".inst 0x6f80ea0b // udot v11.4s, v16.16b, v0.4b[2]\n" + ".inst 0x6f81ea0f // udot v15.4s, v16.16b, v1.4b[2]\n" + "ldr q16, [x10, #0xd0]\n" + ".inst 0x6fa0ea28 // udot v8.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2c // udot v12.4s, v17.16b, v1.4b[3]\n" + "ldr q17, [x10, #0xe0]\n" + ".inst 0x6fa0ea09 // udot v9.4s, v16.16b, 
v0.4b[3]\n" + ".inst 0x6fa1ea0d // udot v13.4s, v16.16b, v1.4b[3]\n" + "ldr q16, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa0ea2a // udot v10.4s, v17.16b, v0.4b[3]\n" + ".inst 0x6fa1ea2e // udot v14.4s, v17.16b, v1.4b[3]\n" + ".inst 0x6fa0ea0b // udot v11.4s, v16.16b, v0.4b[3]\n" + ".inst 0x6fa1ea0f // udot v15.4s, v16.16b, v1.4b[3]\n" "53:" // Height 2: Multiply loop: Main loop skip "cbz x27, 58f\n" "cmp x27, #0x4\n" "blt 55f\n" "54:" // Height 2: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s19, [x26], #0x4\n" + "ldr s18, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x6f93e228 // udot v8.4s, v17.16b, v19.4b[0]\n" + ".inst 0x6f92e22c // udot v12.4s, v17.16b, v18.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6f93e209 // udot v9.4s, v16.16b, v19.4b[0]\n" + ".inst 0x6f92e20d // udot v13.4s, v16.16b, v18.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f93e22a // udot v10.4s, v17.16b, v19.4b[0]\n" + ".inst 0x6f92e22e // udot v14.4s, v17.16b, v18.4b[0]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f93e20b // udot v11.4s, v16.16b, v19.4b[0]\n" + ".inst 0x6f92e20f // udot v15.4s, v16.16b, v18.4b[0]\n" "bge 54b\n" "55:" // Height 2: Multiply loop: Skip odd blocks "cbz x27, 58f\n" @@ -636,19 +635,19 @@ void a64_hybrid_u8u32_dot_6x16 ( "ldr b0, [x26, #0x0]\n" "ldr b1, [x25, #0x0]\n" "57:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q16, [x10, #0x10]\n" + ".inst 0x6f80e228 // udot v8.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f81e22c // udot v12.4s, v17.16b, v1.4b[0]\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6f80e209 // udot v9.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20d // udot v13.4s, v16.16b, v1.4b[0]\n" + "ldr q16, [x10, #0x30]\n" + ".inst 0x6f80e22a // udot v10.4s, v17.16b, v0.4b[0]\n" + ".inst 0x6f81e22e // udot v14.4s, v17.16b, v1.4b[0]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e20b // udot v11.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e20f // udot v15.4s, v16.16b, v1.4b[0]\n" "58:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -866,13 +865,13 @@ void 
a64_hybrid_u8u32_dot_6x16 ( "82:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 83f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 84f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -881,8 +880,8 @@ void a64_hybrid_u8u32_dot_6x16 ( "b 84f\n" "83:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "84:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 87f\n" @@ -899,75 +898,75 @@ void a64_hybrid_u8u32_dot_6x16 ( "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" "add x25, x25, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "add x24, x24, #0x10\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" "cmp x27, #0x20\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x50]\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x50]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 
0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" 
"ldr q7, [x10, #0x10]\n" "bge 85b\n" @@ -977,98 +976,98 @@ void a64_hybrid_u8u32_dot_6x16 ( "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q21, [x10, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" "add x24, x24, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q20, [x10, #0x30]\n" "sub x27, x27, #0x10\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x40]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x50]\n" + ".inst 0x6fa0e2a8 // udot v8.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ac // udot v12.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b0 // udot v16.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x60]\n" + ".inst 
0x6fa0e289 // udot v9.4s, v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28d // udot v13.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e291 // udot v17.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x70]\n" + ".inst 0x6fa0e2aa // udot v10.4s, v21.16b, v0.4b[1]\n" + ".inst 0x6fa1e2ae // udot v14.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa2e2b2 // udot v18.4s, v21.16b, v2.4b[1]\n" + "ldr q21, [x10, #0x80]\n" + ".inst 0x6fa0e28b // udot v11.4s, v20.16b, v0.4b[1]\n" + ".inst 0x6fa1e28f // udot v15.4s, v20.16b, v1.4b[1]\n" + ".inst 0x6fa2e293 // udot v19.4s, v20.16b, v2.4b[1]\n" + "ldr q20, [x10, #0x90]\n" + ".inst 0x6f80eaa8 // udot v8.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaac // udot v12.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab0 // udot v16.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xa0]\n" + ".inst 0x6f80ea89 // udot v9.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8d // udot v13.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea91 // udot v17.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xb0]\n" + ".inst 0x6f80eaaa // udot v10.4s, v21.16b, v0.4b[2]\n" + ".inst 0x6f81eaae // udot v14.4s, v21.16b, v1.4b[2]\n" + ".inst 0x6f82eab2 // udot v18.4s, v21.16b, v2.4b[2]\n" + "ldr q21, [x10, #0xc0]\n" + ".inst 0x6f80ea8b // udot v11.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f81ea8f // udot v15.4s, v20.16b, v1.4b[2]\n" + ".inst 0x6f82ea93 // udot v19.4s, v20.16b, v2.4b[2]\n" + "ldr q20, [x10, #0xd0]\n" + ".inst 0x6fa0eaa8 // udot v8.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaac // udot v12.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab0 // udot v16.4s, v21.16b, v2.4b[3]\n" + "ldr q21, [x10, #0xe0]\n" + ".inst 0x6fa0ea89 // udot v9.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6fa1ea8d // udot v13.4s, v20.16b, v1.4b[3]\n" + ".inst 0x6fa2ea91 // udot v17.4s, v20.16b, v2.4b[3]\n" + "ldr q20, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa0eaaa // udot v10.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa1eaae // udot v14.4s, v21.16b, v1.4b[3]\n" + ".inst 0x6fa2eab2 // udot v18.4s, v21.16b, v2.4b[3]\n" + ".inst 0x6fa0ea8b // udot v11.4s, v20.16b, v0.4b[3]\n" + ".inst 0x6fa1ea8f // udot v15.4s, v20.16b, v1.4b[3]\n" + ".inst 0x6fa2ea93 // udot v19.4s, v20.16b, v2.4b[3]\n" "87:" // Height 3: Multiply loop: Main loop skip "cbz x27, 92f\n" "cmp x27, #0x4\n" "blt 89f\n" "88:" // Height 3: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s23, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s22, [x24], #0x4\n" + "ldr q21, [x10, #0x0]\n" + ".inst 0x6f98e2a8 // udot v8.4s, v21.16b, v24.4b[0]\n" + ".inst 0x6f97e2ac // udot v12.4s, v21.16b, v23.4b[0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x6f96e2b0 // udot v16.4s, v21.16b, v22.4b[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x6f98e289 // udot 
v9.4s, v20.16b, v24.4b[0]\n" + ".inst 0x6f97e28d // udot v13.4s, v20.16b, v23.4b[0]\n" + ".inst 0x6f96e291 // udot v17.4s, v20.16b, v22.4b[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f98e2aa // udot v10.4s, v21.16b, v24.4b[0]\n" + ".inst 0x6f97e2ae // udot v14.4s, v21.16b, v23.4b[0]\n" + ".inst 0x6f96e2b2 // udot v18.4s, v21.16b, v22.4b[0]\n" + ".inst 0x6f98e28b // udot v11.4s, v20.16b, v24.4b[0]\n" + ".inst 0x6f97e28f // udot v15.4s, v20.16b, v23.4b[0]\n" + ".inst 0x6f96e293 // udot v19.4s, v20.16b, v22.4b[0]\n" "bge 88b\n" "89:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 92f\n" @@ -1086,23 +1085,23 @@ void a64_hybrid_u8u32_dot_6x16 ( "ldr b1, [x25, #0x0]\n" "ldr b2, [x24, #0x0]\n" "91:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q21, [x10, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + ".inst 0x6f80e2a8 // udot v8.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f81e2ac // udot v12.4s, v21.16b, v1.4b[0]\n" + ".inst 0x6f82e2b0 // udot v16.4s, v21.16b, v2.4b[0]\n" + "ldr q21, [x10, #0x20]\n" + ".inst 0x6f80e289 // udot v9.4s, v20.16b, v0.4b[0]\n" + ".inst 0x6f81e28d // udot v13.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e291 // udot v17.4s, v20.16b, v2.4b[0]\n" + "ldr q20, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e2aa // udot v10.4s, v21.16b, v0.4b[0]\n" + ".inst 0x6f81e2ae // udot v14.4s, v21.16b, v1.4b[0]\n" + ".inst 0x6f82e2b2 // udot v18.4s, v21.16b, v2.4b[0]\n" + ".inst 0x6f80e28b // udot v11.4s, v20.16b, v0.4b[0]\n" + ".inst 0x6f81e28f // udot v15.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f82e293 // udot v19.4s, v20.16b, v2.4b[0]\n" "92:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1367,14 +1366,14 @@ void a64_hybrid_u8u32_dot_6x16 ( "116:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 117f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 
118f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1384,9 +1383,9 @@ void a64_hybrid_u8u32_dot_6x16 ( "b 118f\n" "117:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "118:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 121f\n" @@ -1405,7 +1404,7 @@ void a64_hybrid_u8u32_dot_6x16 ( "add x26, x26, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x25, x25, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" @@ -1413,85 +1412,85 @@ void a64_hybrid_u8u32_dot_6x16 ( "add x23, x23, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "cmp x27, #0x20\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - 
"ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x6f80eb2a // udot v10.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xd0]\n" + ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb0d // udot 
v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 119b\n" @@ -1502,7 +1501,7 @@ void a64_hybrid_u8u32_dot_6x16 ( "add x25, x25, #0x10\n" ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q25, [x10, #0x20]\n" "add x24, x24, #0x10\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" @@ -1510,112 +1509,112 @@ void a64_hybrid_u8u32_dot_6x16 ( "sub x27, x27, #0x10\n" ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q24, [x10, #0x30]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x40]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x40]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, 
v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x50]\n" + ".inst 0x6fa0e328 // udot v8.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32c // udot v12.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e330 // udot v16.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e334 // udot v20.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x60]\n" + ".inst 0x6fa0e309 // udot v9.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30d // udot v13.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e311 // udot v17.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e315 // udot v21.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x6fa0e32a // udot v10.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa1e32e // udot v14.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa2e332 // udot v18.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa3e336 // udot v22.4s, v25.16b, v3.4b[1]\n" + "ldr q25, [x10, #0x80]\n" + ".inst 0x6fa0e30b // udot v11.4s, v24.16b, v0.4b[1]\n" + ".inst 0x6fa1e30f // udot v15.4s, v24.16b, v1.4b[1]\n" + ".inst 0x6fa2e313 // udot v19.4s, v24.16b, v2.4b[1]\n" + ".inst 0x6fa3e317 // udot v23.4s, v24.16b, v3.4b[1]\n" + "ldr q24, [x10, #0x90]\n" + ".inst 0x6f80eb28 // udot v8.4s, v25.16b, v0.4b[2]\n" + ".inst 0x6f81eb2c // udot v12.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb30 // udot v16.4s, v25.16b, v2.4b[2]\n" + ".inst 0x6f83eb34 // udot v20.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xa0]\n" + ".inst 0x6f80eb09 // udot v9.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0d // udot v13.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb11 // udot v17.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb15 // udot v21.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xb0]\n" + ".inst 0x6f80eb2a // udot v10.4s, v25.16b, 
v0.4b[2]\n" + ".inst 0x6f81eb2e // udot v14.4s, v25.16b, v1.4b[2]\n" + ".inst 0x6f82eb32 // udot v18.4s, v25.16b, v2.4b[2]\n" + ".inst 0x6f83eb36 // udot v22.4s, v25.16b, v3.4b[2]\n" + "ldr q25, [x10, #0xc0]\n" + ".inst 0x6f80eb0b // udot v11.4s, v24.16b, v0.4b[2]\n" + ".inst 0x6f81eb0f // udot v15.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f82eb13 // udot v19.4s, v24.16b, v2.4b[2]\n" + ".inst 0x6f83eb17 // udot v23.4s, v24.16b, v3.4b[2]\n" + "ldr q24, [x10, #0xd0]\n" + ".inst 0x6fa0eb28 // udot v8.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2c // udot v12.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb30 // udot v16.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb34 // udot v20.4s, v25.16b, v3.4b[3]\n" + "ldr q25, [x10, #0xe0]\n" + ".inst 0x6fa0eb09 // udot v9.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb0d // udot v13.4s, v24.16b, v1.4b[3]\n" + ".inst 0x6fa2eb11 // udot v17.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb15 // udot v21.4s, v24.16b, v3.4b[3]\n" + "ldr q24, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa0eb2a // udot v10.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa1eb2e // udot v14.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa2eb32 // udot v18.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa3eb36 // udot v22.4s, v25.16b, v3.4b[3]\n" + ".inst 0x6fa0eb0b // udot v11.4s, v24.16b, v0.4b[3]\n" + ".inst 0x6fa1eb0f // udot v15.4s, v24.16b, v1.4b[3]\n" + ".inst 0x6fa2eb13 // udot v19.4s, v24.16b, v2.4b[3]\n" + ".inst 0x6fa3eb17 // udot v23.4s, v24.16b, v3.4b[3]\n" "121:" // Height 4: Multiply loop: Main loop skip "cbz x27, 126f\n" "cmp x27, #0x4\n" "blt 123f\n" "122:" // Height 4: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s27, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x6f9de328 // udot v8.4s, v25.16b, v29.4b[0]\n" + ".inst 0x6f9ce32c // udot v12.4s, v25.16b, v28.4b[0]\n" + ".inst 0x6f9be330 // udot v16.4s, v25.16b, v27.4b[0]\n" + ".inst 0x6f9ae334 // udot v20.4s, v25.16b, v26.4b[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x6f9de309 // udot v9.4s, v24.16b, v29.4b[0]\n" + ".inst 0x6f9ce30d // udot v13.4s, v24.16b, v28.4b[0]\n" + ".inst 0x6f9be311 // udot v17.4s, v24.16b, v27.4b[0]\n" + ".inst 0x6f9ae315 // udot v21.4s, v24.16b, v26.4b[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, 
v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f9de32a // udot v10.4s, v25.16b, v29.4b[0]\n" + ".inst 0x6f9ce32e // udot v14.4s, v25.16b, v28.4b[0]\n" + ".inst 0x6f9be332 // udot v18.4s, v25.16b, v27.4b[0]\n" + ".inst 0x6f9ae336 // udot v22.4s, v25.16b, v26.4b[0]\n" + ".inst 0x6f9de30b // udot v11.4s, v24.16b, v29.4b[0]\n" + ".inst 0x6f9ce30f // udot v15.4s, v24.16b, v28.4b[0]\n" + ".inst 0x6f9be313 // udot v19.4s, v24.16b, v27.4b[0]\n" + ".inst 0x6f9ae317 // udot v23.4s, v24.16b, v26.4b[0]\n" "bge 122b\n" "123:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 126f\n" @@ -1636,27 +1635,27 @@ void a64_hybrid_u8u32_dot_6x16 ( "ldr b2, [x24, #0x0]\n" "ldr b3, [x23, #0x0]\n" "125:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q25, [x10, #0x0]\n" + "ldr q24, [x10, #0x10]\n" + ".inst 0x6f80e328 // udot v8.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e32c // udot v12.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f82e330 // udot v16.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e334 // udot v20.4s, v25.16b, v3.4b[0]\n" + "ldr q25, [x10, #0x20]\n" + ".inst 0x6f80e309 // udot v9.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30d // udot v13.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e311 // udot v17.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e315 // udot v21.4s, v24.16b, v3.4b[0]\n" + "ldr q24, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e32a // udot v10.4s, v25.16b, v0.4b[0]\n" + ".inst 0x6f81e32e // udot v14.4s, v25.16b, v1.4b[0]\n" + ".inst 0x6f82e332 // udot v18.4s, v25.16b, v2.4b[0]\n" + ".inst 0x6f83e336 // udot v22.4s, v25.16b, v3.4b[0]\n" + ".inst 0x6f80e30b // udot v11.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f81e30f // udot v15.4s, v24.16b, v1.4b[0]\n" + ".inst 0x6f82e313 // udot v19.4s, v24.16b, v2.4b[0]\n" + ".inst 0x6f83e317 // udot v23.4s, v24.16b, v3.4b[0]\n" "126:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1960,162 +1959,162 @@ void a64_hybrid_u8u32_dot_6x16 ( "movi v22.4s, #0x0\n" "movi v23.4s, #0x0\n" "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "149:" // Height 5: setup done - "mov x28, #0x0\n" - "150:" // Height 5: String loop - "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 151f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "cbnz x28, 152f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20\n" - "add x25, x25, x20\n" - "add x24, x24, x20\n" - "add x23, x23, x20\n" - "add x22, x22, x20\n" - "b 152f\n" - "151:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "152:" // Height 5: input setup done - "cmp x27, #0x10\n" - "blt 155f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x20\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "blt 154f\n" - "153:" // Height 5: Multiply loop: Main loop head - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "sub x27, x27, #0x10\n" - "add x26, x26, #0x10\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - "add x23, x23, #0x10\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x20\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" - "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0da // udot v26.4s, v6.16b, 
v4.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "149:" // Height 5: setup done + "mov x28, #0x0\n" + "150:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 151f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 152f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20\n" + "add x25, x25, x20\n" + "add x24, x24, x20\n" + "add x23, x23, x20\n" + "add x22, x22, x20\n" + "b 152f\n" + "151:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "152:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 155f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x20\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 154f\n" + "153:" // Height 5: Multiply loop: Main loop head + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cc // udot 
v12.4s, v6.16b, v1.4b[0]\n" + "sub x27, x27, #0x10\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x23, x23, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x20\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n" 
+ ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n" "ldr q6, [x10, #0x0]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n" "ldr q0, [x26, #0x0]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n" "ldr q1, [x25, #0x0]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n" "ldr q2, [x24, #0x0]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n" "ldr q3, [x23, #0x0]\n" - ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n" "ldr q4, [x22, #0x0]\n" "ldr q7, [x10, #0x10]\n" "bge 153b\n" @@ -2129,7 +2128,7 @@ void a64_hybrid_u8u32_dot_6x16 ( "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" + "ldr q29, [x10, #0x20]\n" ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" "add x22, x22, #0x10\n" ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" @@ -2138,131 +2137,131 @@ void a64_hybrid_u8u32_dot_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q28, [x10, #0x30]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, 
v3.4b[0]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" - "ldr q6, [x10, #0x80]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" - ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" - ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" - "ldr q7, [x10, #0x90]\n" - ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xa0]\n" - ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xb0]\n" - ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" - "ldr q6, [x10, #0xc0]\n" - ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" - ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" - ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" - "ldr q7, [x10, #0xd0]\n" - ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" - "ldr q6, [x10, #0xe0]\n" - ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" - "ldr q7, [x10, #0xf0]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, 
v4.4b[0]\n" + "ldr q29, [x10, #0x40]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x50]\n" + ".inst 0x6fa0e3a8 // udot v8.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ac // udot v12.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b0 // udot v16.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b4 // udot v20.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3b8 // udot v24.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x60]\n" + ".inst 0x6fa0e389 // udot v9.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38d // udot v13.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e391 // udot v17.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e395 // udot v21.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x70]\n" + ".inst 0x6fa0e3aa // udot v10.4s, v29.16b, v0.4b[1]\n" + ".inst 0x6fa1e3ae // udot v14.4s, v29.16b, v1.4b[1]\n" + ".inst 0x6fa2e3b2 // udot v18.4s, v29.16b, v2.4b[1]\n" + ".inst 0x6fa3e3b6 // udot v22.4s, v29.16b, v3.4b[1]\n" + ".inst 0x6fa4e3ba // udot v26.4s, v29.16b, v4.4b[1]\n" + "ldr q29, [x10, #0x80]\n" + ".inst 0x6fa0e38b // udot v11.4s, v28.16b, v0.4b[1]\n" + ".inst 0x6fa1e38f // udot v15.4s, v28.16b, v1.4b[1]\n" + ".inst 0x6fa2e393 // udot v19.4s, v28.16b, v2.4b[1]\n" + ".inst 0x6fa3e397 // udot v23.4s, v28.16b, v3.4b[1]\n" + ".inst 0x6fa4e39b // udot v27.4s, v28.16b, v4.4b[1]\n" + "ldr q28, [x10, #0x90]\n" + ".inst 0x6f80eba8 // udot v8.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebac // udot v12.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb0 // udot v16.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb4 // udot v20.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebb8 // udot v24.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xa0]\n" + ".inst 0x6f80eb89 // udot v9.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8d // udot v13.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb91 // udot v17.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb95 // udot v21.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb99 // udot v25.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xb0]\n" + ".inst 0x6f80ebaa // udot v10.4s, v29.16b, v0.4b[2]\n" + ".inst 0x6f81ebae // udot v14.4s, v29.16b, v1.4b[2]\n" + ".inst 0x6f82ebb2 // udot v18.4s, v29.16b, v2.4b[2]\n" + ".inst 0x6f83ebb6 // udot v22.4s, v29.16b, v3.4b[2]\n" + ".inst 0x6f84ebba // udot v26.4s, v29.16b, v4.4b[2]\n" + "ldr q29, [x10, #0xc0]\n" + ".inst 0x6f80eb8b // udot v11.4s, v28.16b, v0.4b[2]\n" + ".inst 0x6f81eb8f // udot v15.4s, v28.16b, v1.4b[2]\n" + ".inst 0x6f82eb93 // udot v19.4s, v28.16b, v2.4b[2]\n" + ".inst 0x6f83eb97 // udot v23.4s, v28.16b, v3.4b[2]\n" + ".inst 0x6f84eb9b // udot v27.4s, v28.16b, v4.4b[2]\n" + "ldr q28, [x10, #0xd0]\n" + ".inst 0x6fa0eba8 // udot v8.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebac // udot v12.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb0 // udot v16.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb4 // udot v20.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebb8 // udot v24.4s, v29.16b, v4.4b[3]\n" + "ldr q29, [x10, #0xe0]\n" + ".inst 0x6fa0eb89 // udot v9.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb8d // udot v13.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb91 // udot v17.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb95 // udot v21.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb99 // udot v25.4s, v28.16b, v4.4b[3]\n" + "ldr q28, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, 
v0.4b[3]\n" - ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" - ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" - ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" - ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" - ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa0ebaa // udot v10.4s, v29.16b, v0.4b[3]\n" + ".inst 0x6fa1ebae // udot v14.4s, v29.16b, v1.4b[3]\n" + ".inst 0x6fa2ebb2 // udot v18.4s, v29.16b, v2.4b[3]\n" + ".inst 0x6fa3ebb6 // udot v22.4s, v29.16b, v3.4b[3]\n" + ".inst 0x6fa4ebba // udot v26.4s, v29.16b, v4.4b[3]\n" + ".inst 0x6fa0eb8b // udot v11.4s, v28.16b, v0.4b[3]\n" + ".inst 0x6fa1eb8f // udot v15.4s, v28.16b, v1.4b[3]\n" + ".inst 0x6fa2eb93 // udot v19.4s, v28.16b, v2.4b[3]\n" + ".inst 0x6fa3eb97 // udot v23.4s, v28.16b, v3.4b[3]\n" + ".inst 0x6fa4eb9b // udot v27.4s, v28.16b, v4.4b[3]\n" "155:" // Height 5: Multiply loop: Main loop skip "cbz x27, 160f\n" "cmp x27, #0x4\n" "blt 157f\n" "156:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" + "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr q6, [x10, #0x0]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s0, [x24], #0x4\n" + "ldr s31, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "ldr q29, [x10, #0x0]\n" + ".inst 0x6f82e3a8 // udot v8.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x6f80e3b0 // udot v16.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f9fe3b4 // udot v20.4s, v29.16b, v31.4b[0]\n" + ".inst 0x6f9ee3b8 // udot v24.4s, v29.16b, v30.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x6f82e389 // udot v9.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f80e391 // udot v17.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f9fe395 // udot v21.4s, v28.16b, v31.4b[0]\n" + ".inst 0x6f9ee399 // udot v25.4s, v28.16b, v30.4b[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f82e3aa // udot v10.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, 
v1.4b[0]\n" + ".inst 0x6f80e3b2 // udot v18.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f9fe3b6 // udot v22.4s, v29.16b, v31.4b[0]\n" + ".inst 0x6f9ee3ba // udot v26.4s, v29.16b, v30.4b[0]\n" + ".inst 0x6f82e38b // udot v11.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f80e393 // udot v19.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f9fe397 // udot v23.4s, v28.16b, v31.4b[0]\n" + ".inst 0x6f9ee39b // udot v27.4s, v28.16b, v30.4b[0]\n" "bge 156b\n" "157:" // Height 5: Multiply loop: Skip odd blocks "cbz x27, 160f\n" @@ -2286,31 +2285,31 @@ void a64_hybrid_u8u32_dot_6x16 ( "ldr b3, [x23, #0x0]\n" "ldr b4, [x22, #0x0]\n" "159:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q29, [x10, #0x0]\n" + "ldr q28, [x10, #0x10]\n" + ".inst 0x6f80e3a8 // udot v8.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3ac // udot v12.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3b0 // udot v16.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b4 // udot v20.4s, v29.16b, v3.4b[0]\n" + ".inst 0x6f84e3b8 // udot v24.4s, v29.16b, v4.4b[0]\n" + "ldr q29, [x10, #0x20]\n" + ".inst 0x6f80e389 // udot v9.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38d // udot v13.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e391 // udot v17.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e395 // udot v21.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e399 // udot v25.4s, v28.16b, v4.4b[0]\n" + "ldr q28, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f80e3aa // udot v10.4s, v29.16b, v0.4b[0]\n" + ".inst 0x6f81e3ae // udot v14.4s, v29.16b, v1.4b[0]\n" + ".inst 0x6f82e3b2 // udot v18.4s, v29.16b, v2.4b[0]\n" + ".inst 0x6f83e3b6 // udot v22.4s, v29.16b, v3.4b[0]\n" + ".inst 0x6f84e3ba // udot v26.4s, v29.16b, v4.4b[0]\n" + ".inst 0x6f80e38b // udot v11.4s, v28.16b, v0.4b[0]\n" + ".inst 0x6f81e38f // udot v15.4s, v28.16b, v1.4b[0]\n" + ".inst 0x6f82e393 // udot v19.4s, v28.16b, v2.4b[0]\n" + ".inst 0x6f83e397 // udot v23.4s, v28.16b, v3.4b[0]\n" + ".inst 0x6f84e39b // udot v27.4s, v28.16b, v4.4b[0]\n" "160:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2672,16 +2671,16 @@ void a64_hybrid_u8u32_dot_6x16 ( "184:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 185f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 186f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2693,11 +2692,11 @@ void a64_hybrid_u8u32_dot_6x16 ( "b 186f\n" "185:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "186:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 189f\n" @@ -2976,43 +2975,43 @@ void a64_hybrid_u8u32_dot_6x16 ( "cmp x27, #0x4\n" "blt 191f\n" "190:" // Height 6: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s1, [x25], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s6, [x25], #0x4\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "ldr s2, [x24], #0x4\n" - "ldr s3, [x23], #0x4\n" - "ldr s4, [x22], #0x4\n" - "ldr s5, [x21], #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr s5, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s3, [x22], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6f87e028 // udot v8.4s, v1.16b, v7.4b[0]\n" + ".inst 0x6f86e02c // udot v12.4s, v1.16b, v6.4b[0]\n" + ".inst 0x6f85e030 // udot v16.4s, v1.16b, v5.4b[0]\n" + ".inst 0x6f84e034 // udot v20.4s, v1.16b, v4.4b[0]\n" + ".inst 0x6f83e038 // udot v24.4s, v1.16b, v3.4b[0]\n" + ".inst 0x6f82e03c // udot v28.4s, v1.16b, v2.4b[0]\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x6f87e009 // udot v9.4s, v0.16b, v7.4b[0]\n" + ".inst 0x6f86e00d // udot v13.4s, v0.16b, v6.4b[0]\n" + ".inst 0x6f85e011 // udot v17.4s, v0.16b, v5.4b[0]\n" + ".inst 0x6f84e015 // udot v21.4s, v0.16b, v4.4b[0]\n" + ".inst 0x6f83e019 // udot v25.4s, v0.16b, v3.4b[0]\n" + ".inst 0x6f82e01d // udot v29.4s, v0.16b, v2.4b[0]\n" + "ldr q0, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - 
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f87e02a // udot v10.4s, v1.16b, v7.4b[0]\n" + ".inst 0x6f86e02e // udot v14.4s, v1.16b, v6.4b[0]\n" + ".inst 0x6f85e032 // udot v18.4s, v1.16b, v5.4b[0]\n" + ".inst 0x6f84e036 // udot v22.4s, v1.16b, v4.4b[0]\n" + ".inst 0x6f83e03a // udot v26.4s, v1.16b, v3.4b[0]\n" + ".inst 0x6f82e03e // udot v30.4s, v1.16b, v2.4b[0]\n" + ".inst 0x6f87e00b // udot v11.4s, v0.16b, v7.4b[0]\n" + ".inst 0x6f86e00f // udot v15.4s, v0.16b, v6.4b[0]\n" + ".inst 0x6f85e013 // udot v19.4s, v0.16b, v5.4b[0]\n" + ".inst 0x6f84e017 // udot v23.4s, v0.16b, v4.4b[0]\n" + ".inst 0x6f83e01b // udot v27.4s, v0.16b, v3.4b[0]\n" + ".inst 0x6f82e01f // udot v31.4s, v0.16b, v2.4b[0]\n" "bge 190b\n" "191:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 194f\n" @@ -3039,35 +3038,35 @@ void a64_hybrid_u8u32_dot_6x16 ( "ldr b4, [x22, #0x0]\n" "ldr b5, [x21, #0x0]\n" "193:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" - "ldr q7, [x10, #0x30]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ec // udot v12.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f0 // udot v16.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f4 // udot v20.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f8 // udot v24.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fc // udot v28.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cd // udot v13.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d1 // udot v17.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d5 // udot v21.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d9 // udot v25.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dd // udot v29.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x10, #0x30]\n" "add x10, x10, #0x40\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" - ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" - ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" - ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" - ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" - ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f80e0ea // udot v10.4s, v7.16b, 
v0.4b[0]\n" + ".inst 0x6f81e0ee // udot v14.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f2 // udot v18.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f6 // udot v22.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fa // udot v26.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fe // udot v30.4s, v7.16b, v5.4b[0]\n" + ".inst 0x6f80e0cb // udot v11.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0cf // udot v15.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d3 // udot v19.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d7 // udot v23.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0db // udot v27.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0df // udot v31.4s, v6.16b, v5.4b[0]\n" "194:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3254,7 +3253,6 @@ void a64_hybrid_u8u32_dot_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "206:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp index b5cedc7e98..e360452108 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -109,5 +109,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp index dd0c46e4dc..364f388e79 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp @@ -77,7 +77,6 @@ void a64_hybrid_u8u32_mmla_6x16 ( ka.N = N; ka.B_ptr = B_ptr; __asm__ __volatile__( - "1:" // Row loop "cmp %x[M], #0x6\n" "bge 186f\n" @@ -178,11 +177,11 @@ void a64_hybrid_u8u32_mmla_6x16 ( "15:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -198,41 +197,41 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 19f\n" "18:" // Height 1: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v19.2d, v1.2d, v20.2d\n" + ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v20.2d\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 
0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "cmp x27, #0x20\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n" + ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n" "ldr q1, [x26, #0x0]\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" @@ -240,40 +239,40 @@ void a64_hybrid_u8u32_mmla_6x16 ( "prfm pldl1keep, [x26, #0x80]\n" "bge 18b\n" "19:" // Height 1: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "trn1 v20.2d, v1.2d, v21.2d\n" + ".inst 0x6e87a688 // ummla v8.4s, v20.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e86a68c // ummla v12.4s, v20.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e92a689 // ummla v9.4s, v20.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e91a68d // ummla v13.4s, v20.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a68a // ummla v10.4s, v20.16b, v18.16b\n" + "ldr q19, [x10, #0x60]\n" + ".inst 0x6e91a68e // ummla v14.4s, v20.16b, v17.16b\n" + "ldr q18, [x10, #0x70]\n" + "trn2 v1.2d, v1.2d, v21.2d\n" + ".inst 0x6e93a68b // ummla v11.4s, v20.16b, v19.16b\n" + "ldr q17, [x10, #0x80]\n" + ".inst 0x6e92a68f // ummla v15.4s, v20.16b, v18.16b\n" + "ldr q19, [x10, #0x90]\n" + ".inst 0x6e91a428 // ummla v8.4s, v1.16b, v17.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e93a42c // ummla v12.4s, v1.16b, v19.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 
0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n" + ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n" "prfm pldl1keep, [x26, #0x80]\n" "add x10, x10, #0x100\n" "20:" // Height 1: Multiply loop: Main loop skip @@ -281,26 +280,26 @@ void a64_hybrid_u8u32_mmla_6x16 ( "cmp x27, #0x8\n" "blt 22f\n" "21:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + "ldr d19, [x26], #0x8\n" + "ldr q18, [x10, #0x0]\n" + "trn1 v19.2d, v19.2d, v17.2d\n" + "ldr q17, [x10, #0x10]\n" + ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "bge 21b\n" "22:" // Height 1: Multiply loop: Skip odd blocks @@ -325,23 +324,23 @@ void a64_hybrid_u8u32_mmla_6x16 ( "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 "ldr b1, [x26, #0x0]\n" "26:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q23, [x10, #0x0]\n" + "ldr q18, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v17.2d\n" + ".inst 0x6e97a668 // ummla v8.4s, v19.16b, v23.16b\n" + "ldr q17, [x10, #0x20]\n" + ".inst 0x6e92a66c // ummla v12.4s, v19.16b, v18.16b\n" + "ldr q31, [x10, #0x30]\n" + ".inst 0x6e91a669 // ummla v9.4s, v19.16b, v17.16b\n" + "ldr q20, [x10, #0x40]\n" + ".inst 0x6e9fa66d // ummla v13.4s, v19.16b, v31.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e94a66a // ummla v10.4s, v19.16b, v20.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "27:" // 
Height 1: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -525,12 +524,12 @@ void a64_hybrid_u8u32_mmla_6x16 ( "52:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 53f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 54f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -538,7 +537,7 @@ void a64_hybrid_u8u32_mmla_6x16 ( "b 54f\n" "53:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "54:" // Height 2: input setup done "cmp x27, #0x10\n" "blt 57f\n" @@ -549,85 +548,85 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 56f\n" "55:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n" + "ldr 
q17, [x10, #0xf0]\n" "sub x27, x27, #0x10\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "ldr q2, [x25, #0x0]\n" "cmp x27, #0x20\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" + ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n" "add x10, x10, #0x100\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n" "ldr q1, [x26, #0x0]\n" "ldr q6, [x10, #0x10]\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" "bge 55b\n" "56:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e87a668 // ummla v8.4s, v19.16b, v7.16b\n" + "ldr q18, [x10, #0x20]\n" + ".inst 0x6e86a66c // ummla v12.4s, v19.16b, v6.16b\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e92a669 // ummla v9.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x80]\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x90]\n" + ".inst 0x6e92a428 // ummla v8.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xa0]\n" + ".inst 0x6e91a42c // ummla v12.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xb0]\n" + ".inst 0x6e92a429 // ummla v9.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xc0]\n" + ".inst 0x6e91a42d // ummla v13.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xd0]\n" + ".inst 0x6e92a42a // ummla v10.4s, v1.16b, v18.16b\n" + "ldr q18, [x10, #0xe0]\n" + ".inst 0x6e91a42e // ummla v14.4s, v1.16b, v17.16b\n" + "ldr q17, [x10, #0xf0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e92a42b // ummla v11.4s, v1.16b, v18.16b\n" + ".inst 0x6e91a42f // ummla v15.4s, v1.16b, v17.16b\n" "sub x27, x27, #0x10\n" "prfm pldl1keep, [x26, #0x80]\n" "prfm pldl1keep, [x25, #0x80]\n" @@ -637,27 +636,27 @@ void a64_hybrid_u8u32_mmla_6x16 ( "cmp x27, #0x8\n" "blt 59f\n" "58:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - 
"trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d18, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "trn1 v19.2d, v18.2d, v17.2d\n" "sub x27, x27, #0x8\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x20]\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x40]\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - "ldr q6, [x10, #0x60]\n" - "ldr q7, [x10, #0x70]\n" + "ldr q17, [x10, #0x0]\n" + "ldr q22, [x10, #0x10]\n" + ".inst 0x6e91a668 // ummla v8.4s, v19.16b, v17.16b\n" + ".inst 0x6e96a66c // ummla v12.4s, v19.16b, v22.16b\n" + "ldr q1, [x10, #0x20]\n" + "ldr q17, [x10, #0x30]\n" + ".inst 0x6e81a669 // ummla v9.4s, v19.16b, v1.16b\n" + ".inst 0x6e91a66d // ummla v13.4s, v19.16b, v17.16b\n" + "ldr q18, [x10, #0x40]\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q18, [x10, #0x60]\n" + "ldr q17, [x10, #0x70]\n" "cmp x27, #0x8\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "bge 58b\n" "59:" // Height 2: Multiply loop: Skip odd blocks @@ -689,23 +688,23 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr b1, [x26, #0x0]\n" "ldr b2, [x25, #0x0]\n" "63:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + "ldr q18, [x10, #0x0]\n" + "ldr q17, [x10, #0x10]\n" + "trn1 v19.2d, v1.2d, v2.2d\n" + ".inst 0x6e92a668 // ummla v8.4s, v19.16b, v18.16b\n" + "ldr q5, [x10, #0x20]\n" + ".inst 0x6e91a66c // ummla v12.4s, v19.16b, v17.16b\n" + "ldr q21, [x10, #0x30]\n" + ".inst 0x6e85a669 // ummla v9.4s, v19.16b, v5.16b\n" + "ldr q18, [x10, #0x40]\n" + ".inst 0x6e95a66d // ummla v13.4s, v19.16b, v21.16b\n" + "ldr q17, [x10, #0x50]\n" + ".inst 0x6e92a66a // ummla v10.4s, v19.16b, v18.16b\n" + "ldr q18, [x10, #0x60]\n" + ".inst 0x6e91a66e // ummla v14.4s, v19.16b, v17.16b\n" + "ldr q17, [x10, #0x70]\n" + ".inst 0x6e92a66b // ummla v11.4s, v19.16b, v18.16b\n" + ".inst 0x6e91a66f // ummla v15.4s, v19.16b, v17.16b\n" "add x10, x10, #0x80\n" "64:" // Height 2: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -953,13 +952,13 @@ void a64_hybrid_u8u32_mmla_6x16 ( "89:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz 
%x[flags], #3, 90f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 91f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -968,8 +967,8 @@ void a64_hybrid_u8u32_mmla_6x16 ( "b 91f\n" "90:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "91:" // Height 3: input setup done "cmp x27, #0x10\n" "blt 94f\n" @@ -981,167 +980,167 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 93f\n" "92:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n" + ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" "add x26, x26, #0x10\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" "cmp x27, #0x20\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a470 // ummla v16.4s, 
v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 92b\n" "93:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v29.2d\n" + ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n" + ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" + "trn2 v3.2d, v3.2d, v29.2d\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" "add x26, x26, #0x10\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - "ldr q6, 
[x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" "add x25, x25, #0x10\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n" + ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n" "94:" // Height 3: Multiply loop: Main 
loop skip "cbz x27, 101f\n" "cmp x27, #0x8\n" "blt 96f\n" "95:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" + "ldr d25, [x24], #0x8\n" + "ldr q26, [x10, #0x0]\n" + "trn1 v27.2d, v25.2d, v27.2d\n" + ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x8\n" - ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" "cmp x27, #0x8\n" - ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" "add x10, x10, #0x80\n" - ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" "bge 95b\n" "96:" // Height 3: Multiply loop: Skip odd blocks "cbz x27, 101f\n" @@ -1179,33 +1178,33 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr b2, [x25, #0x0]\n" "ldr b3, [x24, #0x0]\n" "100:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, 
v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q29, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v25.2d\n" + ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e9da78c // ummla v12.4s, v28.16b, v29.16b\n" + ".inst 0x6e9da774 // ummla v20.4s, v27.16b, v29.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" "101:" // Height 3: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1499,14 +1498,14 @@ void a64_hybrid_u8u32_mmla_6x16 ( "126:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 127f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 128f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1516,9 +1515,9 @@ void a64_hybrid_u8u32_mmla_6x16 ( "b 128f\n" "127:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "128:" // Height 4: input setup done "cmp x27, #0x10\n" "blt 131f\n" @@ -1531,173 +1530,173 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q6, [x10, #0x10]\n" "blt 130f\n" "129:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n" "sub x27, x27, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, 
v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n" + ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" "add x26, x26, #0x10\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" "add x25, x25, #0x10\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" "add x23, x23, #0x10\n" "ldr q4, [x23, #0x0]\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n" "cmp x27, #0x20\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e87a472 
// ummla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" + ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n" "ldr q3, [x24, #0x0]\n" "ldr q6, [x10, #0x10]\n" "bge 129b\n" "130:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v28.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a788 // ummla v8.4s, v28.16b, v7.16b\n" "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e87a770 // ummla v16.4s, v27.16b, v7.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e86a78c // ummla v12.4s, v28.16b, v6.16b\n" + ".inst 0x6e86a774 // ummla v20.4s, v27.16b, v6.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" "add x25, x25, #0x10\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" "add x24, x24, #0x10\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" "add x23, x23, #0x10\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" "sub x27, x27, #0x10\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x80]\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - "ldr q6, 
[x10, #0x90]\n" - ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x90]\n" + ".inst 0x6e9aa428 // ummla v8.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" + ".inst 0x6e9aa470 // ummla v16.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xa0]\n" + ".inst 0x6e99a42c // ummla v12.4s, v1.16b, v25.16b\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" + ".inst 0x6e99a474 // ummla v20.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xb0]\n" + ".inst 0x6e9aa429 // ummla v9.4s, v1.16b, v26.16b\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + ".inst 0x6e9aa471 // ummla v17.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xc0]\n" + ".inst 0x6e99a42d // ummla v13.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a475 // ummla v21.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xd0]\n" + ".inst 0x6e9aa42a // ummla v10.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa472 // ummla v18.4s, v3.16b, v26.16b\n" + "ldr q26, [x10, #0xe0]\n" + ".inst 0x6e99a42e // ummla v14.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a476 // ummla v22.4s, v3.16b, v25.16b\n" + "ldr q25, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e9aa42b // ummla v11.4s, v1.16b, v26.16b\n" + ".inst 0x6e9aa473 // ummla v19.4s, v3.16b, v26.16b\n" + ".inst 0x6e99a42f // ummla v15.4s, v1.16b, v25.16b\n" + ".inst 0x6e99a477 // ummla v23.4s, v3.16b, v25.16b\n" "131:" // Height 4: Multiply loop: Main loop skip "cbz x27, 138f\n" "cmp x27, #0x8\n" "blt 133f\n" "132:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d26, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "trn1 v28.2d, v26.2d, v25.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d26, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "trn1 v27.2d, v26.2d, v25.2d\n" "cmp x27, #0x8\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a452 // ummla v18.4s, 
v2.16b, v6.16b\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" "bge 132b\n" "133:" // Height 4: Multiply loop: Skip odd blocks "cbz x27, 138f\n" @@ -1742,33 +1741,33 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr b3, [x24, #0x0]\n" "ldr b4, [x23, #0x0]\n" "137:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" + "ldr q26, [x10, #0x0]\n" + "ldr q25, [x10, #0x10]\n" + "trn1 v28.2d, v1.2d, v2.2d\n" + "trn1 v27.2d, v3.2d, v4.2d\n" + ".inst 0x6e9aa788 // ummla v8.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa770 // ummla v16.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x20]\n" + ".inst 0x6e99a78c // ummla v12.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a774 // ummla v20.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x30]\n" + ".inst 0x6e9aa789 // ummla v9.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa771 // ummla v17.4s, v27.16b, v26.16b\n" + "ldr q26, [x10, #0x40]\n" + ".inst 0x6e99a78d // ummla v13.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a775 // ummla v21.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x50]\n" + ".inst 0x6e9aa78a // ummla v10.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa772 // ummla v18.4s, v27.16b, v26.16b\n" + "ldr q26, 
[x10, #0x60]\n" + ".inst 0x6e99a78e // ummla v14.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a776 // ummla v22.4s, v27.16b, v25.16b\n" + "ldr q25, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e9aa78b // ummla v11.4s, v28.16b, v26.16b\n" + ".inst 0x6e9aa773 // ummla v19.4s, v27.16b, v26.16b\n" + ".inst 0x6e99a78f // ummla v15.4s, v28.16b, v25.16b\n" + ".inst 0x6e99a777 // ummla v23.4s, v27.16b, v25.16b\n" "138:" // Height 4: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2125,15 +2124,15 @@ void a64_hybrid_u8u32_mmla_6x16 ( "163:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 164f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 165f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2144,10 +2143,10 @@ void a64_hybrid_u8u32_mmla_6x16 ( "b 165f\n" "164:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "165:" // Height 5: input setup done "cmp x27, #0x10\n" "blt 168f\n" @@ -2160,174 +2159,174 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q7, [x10, #0x0]\n" "blt 167f\n" "166:" // Height 5: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" "sub x27, x27, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n" "add x26, x26, #0x10\n" - ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n" "add x25, x25, #0x10\n" ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x40]\n" "add x24, x24, #0x10\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + 
".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n" "cmp x27, #0x20\n" ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x60]\n" "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n" "prfm pldl1keep, [x25, #0x80]\n" "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x23, #0x80]\n" ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x80]\n" "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n" "ldr q2, [x25, #0x0]\n" - ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xd0]\n" + 
".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "bge 166b\n" "167:" // Height 5: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v6.2d, v1.2d, v2.2d\n" "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a4c8 // ummla v8.4s, v6.16b, v7.16b\n" "trn1 v2.2d, v3.2d, v4.2d\n" "trn2 v3.2d, v3.2d, v4.2d\n" ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" "add x26, x26, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" + "trn1 v4.2d, v5.2d, v0.2d\n" + "trn2 v5.2d, v5.2d, v0.2d\n" + "ldr q0, [x10, #0x10]\n" ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4cc // ummla v12.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a454 // ummla v20.4s, v2.16b, v0.16b\n" "add x25, x25, #0x10\n" - ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49c // ummla v28.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e87a4c9 // ummla v9.4s, v6.16b, v7.16b\n" "add x24, x24, #0x10\n" ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x40]\n" "add x23, x23, #0x10\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4cd // ummla v13.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a455 // ummla v21.4s, v2.16b, v0.16b\n" "add x22, x22, #0x10\n" "sub x27, x27, #0x10\n" - ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e87a4ca // ummla v10.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x26, #0x80]\n" ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x60]\n" "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a4ce // ummla v14.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a456 // ummla v22.4s, v2.16b, v0.16b\n" "prfm pldl1keep, [x24, #0x80]\n" "prfm pldl1keep, [x23, #0x80]\n" - ".inst 
0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a49e // ummla v30.4s, v4.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e87a4cb // ummla v11.4s, v6.16b, v7.16b\n" "prfm pldl1keep, [x22, #0x80]\n" ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" "ldr q7, [x10, #0x80]\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + ".inst 0x6e80a4cf // ummla v15.4s, v6.16b, v0.16b\n" + ".inst 0x6e80a457 // ummla v23.4s, v2.16b, v0.16b\n" + ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xe0]\n" + ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n" ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" @@ -2337,48 +2336,48 @@ void a64_hybrid_u8u32_mmla_6x16 ( "blt 170f\n" "169:" // Height 5: Multiply loop: Odd block loop "ldr 
d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + "ldr d0, [x22], #0x8\n" + "ldr q1, [x10, #0x0]\n" + "trn1 v2.2d, v0.2d, v2.2d\n" + ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n" "cmp x27, #0x8\n" - ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n" + ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n" + ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n" + ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" + ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" + ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n" "add x10, x10, #0x80\n" - ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n" + ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n" "bge 169b\n" "170:" // Height 5: 
Multiply loop: Skip odd blocks "cbz x27, 175f\n" @@ -2430,42 +2429,42 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr b4, [x23, #0x0]\n" "ldr b5, [x22, #0x0]\n" "174:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + "trn1 v2.2d, v5.2d, v0.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x6e86a4e8 // ummla v8.4s, v7.16b, v6.16b\n" + ".inst 0x6e86a470 // ummla v16.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n" + ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n" + ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n" + ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n" + ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n" "175:" // Height 5: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2872,16 +2871,16 @@ void a64_hybrid_u8u32_mmla_6x16 ( "200:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 201f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 202f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2893,11 +2892,11 @@ void a64_hybrid_u8u32_mmla_6x16 ( "b 202f\n" "201:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "202:" // Height 6: input setup done "cmp x27, #0x10\n" "blt 205f\n" @@ -2964,42 +2963,42 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr q2, [x25, #0x0]\n" "prfm pldl1keep, [x21, #0x80]\n" ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + "ldr q0, [x10, #0x90]\n" "ldr q4, [x23, #0x0]\n" ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xf0]\n" + "ldr q6, [x10, #0xa0]\n" + ".inst 0x6e80a42c // ummla v12.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bc // ummla v28.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xb0]\n" + ".inst 0x6e86a429 // ummla v9.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a471 // ummla v17.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4b9 // ummla v25.4s, v5.16b, v6.16b\n" + "ldr q6, [x10, #0xc0]\n" + ".inst 0x6e80a42d // ummla v13.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bd // ummla v29.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xd0]\n" + ".inst 0x6e86a42a // ummla v10.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a472 // ummla v18.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4ba // ummla v26.4s, v5.16b, 
v6.16b\n" + "ldr q6, [x10, #0xe0]\n" + ".inst 0x6e80a42e // ummla v14.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4be // ummla v30.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e86a42b // ummla v11.4s, v1.16b, v6.16b\n" + ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a4bb // ummla v27.4s, v5.16b, v6.16b\n" "ldr q7, [x10, #0x0]\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e80a42f // ummla v15.4s, v1.16b, v0.16b\n" "ldr q1, [x26, #0x0]\n" - ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n" "ldr q3, [x24, #0x0]\n" - ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" + ".inst 0x6e80a4bf // ummla v31.4s, v5.16b, v0.16b\n" "ldr q5, [x22, #0x0]\n" "ldr q6, [x21, #0x0]\n" "bge 203b\n" @@ -3055,35 +3054,35 @@ void a64_hybrid_u8u32_mmla_6x16 ( ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x90]\n" + "ldr q2, [x10, #0x90]\n" ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n" ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n" ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n" - ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n" - ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n" + "ldr q0, [x10, #0xa0]\n" + ".inst 0x6e82a42c // ummla v12.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a474 // ummla v20.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4bc // ummla v28.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xb0]\n" + ".inst 0x6e80a429 // ummla v9.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xc0]\n" + ".inst 0x6e82a42d // ummla v13.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a475 // ummla v21.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4bd // ummla v29.4s, v5.16b, v2.16b\n" + "ldr q2, [x10, #0xd0]\n" + ".inst 0x6e80a42a // ummla v10.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4ba // ummla v26.4s, v5.16b, v0.16b\n" + "ldr q0, [x10, #0xe0]\n" + ".inst 0x6e82a42e // ummla v14.4s, v1.16b, v2.16b\n" + ".inst 0x6e82a476 // ummla v22.4s, v3.16b, v2.16b\n" + ".inst 0x6e82a4be // ummla v30.4s, v5.16b, v2.16b\n" "ldr q6, [x10, #0xf0]\n" "add x10, x10, #0x100\n" - ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n" - ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n" 
- ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n" + ".inst 0x6e80a42b // ummla v11.4s, v1.16b, v0.16b\n" + ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a4bb // ummla v27.4s, v5.16b, v0.16b\n" ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n" @@ -3093,49 +3092,49 @@ void a64_hybrid_u8u32_mmla_6x16 ( "blt 207f\n" "206:" // Height 6: Multiply loop: Odd block loop "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "trn1 v0.2d, v1.2d, v2.2d\n" + "ldr d0, [x25], #0x8\n" + "trn1 v4.2d, v1.2d, v0.2d\n" "sub x27, x27, #0x8\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" + "ldr d1, [x24], #0x8\n" + "ldr d0, [x23], #0x8\n" + "trn1 v3.2d, v1.2d, v0.2d\n" "cmp x27, #0x8\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n" + "ldr d1, [x22], #0x8\n" + "ldr d0, [x21], #0x8\n" + "trn1 v2.2d, v1.2d, v0.2d\n" + "ldr q1, [x10, #0x0]\n" + "ldr q0, [x10, #0x10]\n" + ".inst 0x6e81a488 // ummla v8.4s, v4.16b, v1.16b\n" + ".inst 0x6e81a470 // ummla v16.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a458 // ummla v24.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x20]\n" + ".inst 0x6e80a48c // ummla v12.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a474 // ummla v20.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45c // ummla v28.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x30]\n" + ".inst 0x6e81a489 // ummla v9.4s, v4.16b, v1.16b\n" + ".inst 0x6e81a471 // ummla v17.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a459 // ummla v25.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x40]\n" + ".inst 0x6e80a48d // ummla v13.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a475 // ummla v21.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45d // ummla v29.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x50]\n" + ".inst 0x6e81a48a // ummla v10.4s, v4.16b, v1.16b\n" + ".inst 0x6e81a472 // ummla v18.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45a // ummla v26.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x60]\n" - ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x70]\n" + ".inst 0x6e80a48e // ummla v14.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a476 // ummla v22.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45e // ummla v30.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x70]\n" "add x10, x10, #0x80\n" - ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49b // ummla 
v27.4s, v4.16b, v6.16b\n" - ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n" + ".inst 0x6e86a48b // ummla v11.4s, v4.16b, v6.16b\n" + ".inst 0x6e86a473 // ummla v19.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a45b // ummla v27.4s, v2.16b, v6.16b\n" + ".inst 0x6e80a48f // ummla v15.4s, v4.16b, v0.16b\n" + ".inst 0x6e80a477 // ummla v23.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45f // ummla v31.4s, v2.16b, v0.16b\n" "bge 206b\n" "207:" // Height 6: Multiply loop: Skip odd blocks "cbz x27, 212f\n" @@ -3194,42 +3193,42 @@ void a64_hybrid_u8u32_mmla_6x16 ( "ldr b5, [x22, #0x0]\n" "ldr b6, [x21, #0x0]\n" "211:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" - ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n" + "ldr q0, [x10, #0x0]\n" + "trn1 v7.2d, v1.2d, v2.2d\n" + "trn1 v3.2d, v3.2d, v4.2d\n" + ".inst 0x6e80a4e8 // ummla v8.4s, v7.16b, v0.16b\n" + "trn1 v2.2d, v5.2d, v6.2d\n" + "ldr q1, [x10, #0x10]\n" + ".inst 0x6e80a470 // ummla v16.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a458 // ummla v24.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x20]\n" + ".inst 0x6e81a4ec // ummla v12.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a474 // ummla v20.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45c // ummla v28.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x30]\n" + ".inst 0x6e80a4e9 // ummla v9.4s, v7.16b, v0.16b\n" + ".inst 0x6e80a471 // ummla v17.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a459 // ummla v25.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x40]\n" + ".inst 0x6e81a4ed // ummla v13.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a475 // ummla v21.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45d // ummla v29.4s, v2.16b, v1.16b\n" + "ldr q1, [x10, #0x50]\n" + ".inst 0x6e80a4ea // ummla v10.4s, v7.16b, v0.16b\n" + ".inst 0x6e80a472 // ummla v18.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45a // ummla v26.4s, v2.16b, v0.16b\n" + "ldr q0, [x10, #0x60]\n" + ".inst 0x6e81a4ee // ummla v14.4s, v7.16b, v1.16b\n" + ".inst 0x6e81a476 // ummla v22.4s, v3.16b, v1.16b\n" + ".inst 0x6e81a45e // ummla v30.4s, v2.16b, v1.16b\n" "ldr q6, [x10, #0x70]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n" "add x10, x10, #0x80\n" - ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n" - ".inst 0x6e87a49b // ummla 
v27.4s, v4.16b, v7.16b\n" - ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n" + ".inst 0x6e80a473 // ummla v19.4s, v3.16b, v0.16b\n" + ".inst 0x6e80a45b // ummla v27.4s, v2.16b, v0.16b\n" + ".inst 0x6e86a4ef // ummla v15.4s, v7.16b, v6.16b\n" + ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n" + ".inst 0x6e86a45f // ummla v31.4s, v2.16b, v6.16b\n" "212:" // Height 6: Multiply loop: No odd multiplies "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3440,7 +3439,6 @@ void a64_hybrid_u8u32_mmla_6x16 ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "224:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp index 153a4cc167..25c5bf1b44 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return 12; } - static unsigned int stripe_width() - { - return 4; - } - static constexpr unsigned int k_unroll() { return 2; @@ -97,5 +92,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp index b3bde74635..5684f464b6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_bf16fp32_dot_8x12( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_dot_8x12( ka.bblocks = bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -88,8 +91,8 @@ void a64_interleaved_bf16fp32_dot_8x12( "movi v31.16b, #0x0\n" "blt 4f\n" "3:" // main loop head - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" + "ldr q3, [%x[Apanel], #0x20]\n" + "ldr q7, [%x[Apanel], #0x30]\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n" ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n" @@ -123,35 +126,35 @@ void a64_interleaved_bf16fp32_dot_8x12( ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n" ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" - "ldr q6, [x22, #0x50]\n" + "ldr q2, [x22, #0x50]\n" "ldr q1, [%x[Apanel], #0x10]\n" "add x22, x22, #0x60\n" - ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n" - ".inst 0x4f62f08b // bfdot v11.4s, v4.8h, v2.h[1]\n" - ".inst 0x4f42f88e // bfdot v14.4s, v4.8h, v2.h[2]\n" - ".inst 0x4f62f891 // bfdot v17.4s, v4.8h, v2.h[3]\n" - ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" - ".inst 0x4f63f097 // bfdot v23.4s, v4.8h, v3.h[1]\n" - ".inst 0x4f43f89a // bfdot v26.4s, v4.8h, v3.h[2]\n" - ".inst 0x4f63f89d // bfdot v29.4s, v4.8h, v3.h[3]\n" + ".inst 0x4f43f088 // bfdot v8.4s, v4.8h, v3.h[0]\n" + ".inst 0x4f63f08b // bfdot v11.4s, v4.8h, v3.h[1]\n" + ".inst 0x4f43f88e // bfdot v14.4s, v4.8h, v3.h[2]\n" + ".inst 0x4f63f891 // bfdot v17.4s, v4.8h, v3.h[3]\n" + ".inst 0x4f47f094 // bfdot v20.4s, v4.8h, v7.h[0]\n" + ".inst 0x4f67f097 // bfdot v23.4s, v4.8h, v7.h[1]\n" + ".inst 0x4f47f89a // bfdot v26.4s, v4.8h, v7.h[2]\n" + ".inst 0x4f67f89d // bfdot v29.4s, v4.8h, v7.h[3]\n" "ldr q4, [x22, #0x0]\n" - ".inst 0x4f42f0a9 // bfdot v9.4s, v5.8h, v2.h[0]\n" - ".inst 0x4f62f0ac // bfdot v12.4s, v5.8h, v2.h[1]\n" - ".inst 0x4f42f8af // bfdot v15.4s, v5.8h, v2.h[2]\n" - ".inst 0x4f62f8b2 // bfdot v18.4s, v5.8h, v2.h[3]\n" - ".inst 0x4f43f0b5 // bfdot v21.4s, v5.8h, v3.h[0]\n" - ".inst 0x4f63f0b8 // bfdot v24.4s, v5.8h, v3.h[1]\n" - ".inst 0x4f43f8bb // bfdot v27.4s, v5.8h, v3.h[2]\n" - ".inst 0x4f63f8be // bfdot v30.4s, v5.8h, v3.h[3]\n" + ".inst 0x4f43f0a9 // bfdot v9.4s, v5.8h, v3.h[0]\n" + ".inst 0x4f63f0ac // bfdot v12.4s, v5.8h, v3.h[1]\n" + ".inst 0x4f43f8af // bfdot 
v15.4s, v5.8h, v3.h[2]\n" + ".inst 0x4f63f8b2 // bfdot v18.4s, v5.8h, v3.h[3]\n" + ".inst 0x4f47f0b5 // bfdot v21.4s, v5.8h, v7.h[0]\n" + ".inst 0x4f67f0b8 // bfdot v24.4s, v5.8h, v7.h[1]\n" + ".inst 0x4f47f8bb // bfdot v27.4s, v5.8h, v7.h[2]\n" + ".inst 0x4f67f8be // bfdot v30.4s, v5.8h, v7.h[3]\n" "ldr q5, [x22, #0x10]\n" - ".inst 0x4f42f0ca // bfdot v10.4s, v6.8h, v2.h[0]\n" - ".inst 0x4f62f0cd // bfdot v13.4s, v6.8h, v2.h[1]\n" - ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" - ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n" - ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" - ".inst 0x4f63f0d9 // bfdot v25.4s, v6.8h, v3.h[1]\n" - ".inst 0x4f43f8dc // bfdot v28.4s, v6.8h, v3.h[2]\n" - ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f43f04a // bfdot v10.4s, v2.8h, v3.h[0]\n" + ".inst 0x4f63f04d // bfdot v13.4s, v2.8h, v3.h[1]\n" + ".inst 0x4f43f850 // bfdot v16.4s, v2.8h, v3.h[2]\n" + ".inst 0x4f63f853 // bfdot v19.4s, v2.8h, v3.h[3]\n" + ".inst 0x4f47f056 // bfdot v22.4s, v2.8h, v7.h[0]\n" + ".inst 0x4f67f059 // bfdot v25.4s, v2.8h, v7.h[1]\n" + ".inst 0x4f47f85c // bfdot v28.4s, v2.8h, v7.h[2]\n" + ".inst 0x4f67f85f // bfdot v31.4s, v2.8h, v7.h[3]\n" "ldr q6, [x22, #0x20]\n" "bge 3b\n" "4:" // main loop skip @@ -182,37 +185,37 @@ void a64_interleaved_bf16fp32_dot_8x12( ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" "cbz x20, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" + "ldr q4, [%x[Apanel], #0x0]\n" + "ldr q3, [%x[Apanel], #0x10]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x22, #0x0]\n" - "ldr q4, [x22, #0x10]\n" - ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n" - "ldr q5, [x22, #0x20]\n" - ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" - ".inst 0x4f40f8ee // bfdot v14.4s, v7.8h, v0.h[2]\n" - ".inst 0x4f60f8f1 // bfdot v17.4s, v7.8h, v0.h[3]\n" - ".inst 0x4f41f0f4 // bfdot v20.4s, v7.8h, v1.h[0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q1, [x22, #0x10]\n" + ".inst 0x4f44f048 // bfdot v8.4s, v2.8h, v4.h[0]\n" + "ldr q0, [x22, #0x20]\n" + ".inst 0x4f64f04b // bfdot v11.4s, v2.8h, v4.h[1]\n" + ".inst 0x4f44f84e // bfdot v14.4s, v2.8h, v4.h[2]\n" + ".inst 0x4f64f851 // bfdot v17.4s, v2.8h, v4.h[3]\n" + ".inst 0x4f43f054 // bfdot v20.4s, v2.8h, v3.h[0]\n" "add x22, x22, #0x30\n" - ".inst 0x4f61f0f7 // bfdot v23.4s, v7.8h, v1.h[1]\n" - ".inst 0x4f41f8fa // bfdot v26.4s, v7.8h, v1.h[2]\n" - ".inst 0x4f61f8fd // bfdot v29.4s, v7.8h, v1.h[3]\n" - ".inst 0x4f40f089 // bfdot v9.4s, v4.8h, v0.h[0]\n" - ".inst 0x4f60f08c // bfdot v12.4s, v4.8h, v0.h[1]\n" - ".inst 0x4f40f88f // bfdot v15.4s, v4.8h, v0.h[2]\n" - ".inst 0x4f60f892 // bfdot v18.4s, v4.8h, v0.h[3]\n" - ".inst 0x4f41f095 // bfdot v21.4s, v4.8h, v1.h[0]\n" - ".inst 0x4f61f098 // bfdot v24.4s, v4.8h, v1.h[1]\n" - ".inst 0x4f41f89b // bfdot v27.4s, v4.8h, v1.h[2]\n" - ".inst 0x4f61f89e // bfdot v30.4s, v4.8h, v1.h[3]\n" - ".inst 0x4f40f0aa // bfdot v10.4s, v5.8h, v0.h[0]\n" - ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n" - ".inst 0x4f40f8b0 // bfdot v16.4s, v5.8h, v0.h[2]\n" - ".inst 0x4f60f8b3 // bfdot v19.4s, v5.8h, v0.h[3]\n" - ".inst 0x4f41f0b6 // bfdot v22.4s, v5.8h, v1.h[0]\n" - ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n" - ".inst 0x4f41f8bc // bfdot v28.4s, v5.8h, v1.h[2]\n" - ".inst 0x4f61f8bf // bfdot v31.4s, v5.8h, v1.h[3]\n" + ".inst 0x4f63f057 // bfdot v23.4s, v2.8h, v3.h[1]\n" + ".inst 0x4f43f85a // bfdot v26.4s, v2.8h, v3.h[2]\n" + ".inst 0x4f63f85d // bfdot v29.4s, v2.8h, 
v3.h[3]\n" + ".inst 0x4f44f029 // bfdot v9.4s, v1.8h, v4.h[0]\n" + ".inst 0x4f64f02c // bfdot v12.4s, v1.8h, v4.h[1]\n" + ".inst 0x4f44f82f // bfdot v15.4s, v1.8h, v4.h[2]\n" + ".inst 0x4f64f832 // bfdot v18.4s, v1.8h, v4.h[3]\n" + ".inst 0x4f43f035 // bfdot v21.4s, v1.8h, v3.h[0]\n" + ".inst 0x4f63f038 // bfdot v24.4s, v1.8h, v3.h[1]\n" + ".inst 0x4f43f83b // bfdot v27.4s, v1.8h, v3.h[2]\n" + ".inst 0x4f63f83e // bfdot v30.4s, v1.8h, v3.h[3]\n" + ".inst 0x4f44f00a // bfdot v10.4s, v0.8h, v4.h[0]\n" + ".inst 0x4f64f00d // bfdot v13.4s, v0.8h, v4.h[1]\n" + ".inst 0x4f44f810 // bfdot v16.4s, v0.8h, v4.h[2]\n" + ".inst 0x4f64f813 // bfdot v19.4s, v0.8h, v4.h[3]\n" + ".inst 0x4f43f016 // bfdot v22.4s, v0.8h, v3.h[0]\n" + ".inst 0x4f63f019 // bfdot v25.4s, v0.8h, v3.h[1]\n" + ".inst 0x4f43f81c // bfdot v28.4s, v0.8h, v3.h[2]\n" + ".inst 0x4f63f81f // bfdot v31.4s, v0.8h, v3.h[3]\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" "str q8, [%x[Cpanel], #0x0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp index 17c93faca2..66c2b92a34 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -57,11 +57,6 @@ public: return 12; } - static unsigned int stripe_width() - { - return 4; - } - static constexpr unsigned int k_unroll() { return 4; @@ -117,5 +112,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp index cba29bc572..bab687a9b4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_bf16fp32_mmla_8x12_a510( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( ka.bblocks = bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -82,28 +85,28 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( "movi v31.16b, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v6.8h }, [%x[Apanel]], #0x10\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n" - "ldp q6, q7, [x22], #0x20\n" + "ldp q3, q7, [x22], #0x20\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" "sub x20, x20, #0x2\n" ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n" "cmp x20, #0x2\n" - ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" + ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e43ec09 // bfmmla v9.4s, v0.8h, v3.8h\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e43ec2f // bfmmla v15.4s, v1.8h, v3.8h\n" ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e43ec55 // bfmmla v21.4s, v2.8h, v3.8h\n" ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x6e43ecdb // bfmmla v27.4s, v6.8h, v3.8h\n" + ".inst 0x6e47ecde // bfmmla v30.4s, v6.8h, v7.8h\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" @@ -113,39 +116,39 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" - "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e44ecdc // bfmmla 
v28.4s, v6.8h, v4.8h\n" + ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n" + "ld1 { v6.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n" + ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" + ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n" + ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n" "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n" + ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n" "bge 3b\n" "4:" // main loop skip "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" @@ -158,7 +161,7 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" - "ldp q4, q5, [x22], #0x20\n" + "ldp q5, q4, [x22], #0x20\n" ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" @@ -167,93 +170,93 @@ void a64_interleaved_bf16fp32_mmla_8x12_a510( ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" - ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" - ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n" - ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n" - ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, 
v4.8h\n" - ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" + ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" + ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n" + ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n" + ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n" + ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" + ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n" + ".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n" + ".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n" "cbz x20, 5f\n" - "ldp q6, q7, [x22], #0x20\n" - "ld1 { v0.8h }, [%x[Apanel]], #0x10\n" - "ld1 { v1.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ld1 { v2.8h }, [%x[Apanel]], #0x10\n" - "ld1 { v3.8h }, [%x[Apanel]], #0x10\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldp q6, q7, [x22], #0x20\n" - ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" - ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" - ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" - ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + "ldp q1, q0, [x22], #0x20\n" + "ld1 { v7.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v6.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n" + "ld1 { v5.8h }, [%x[Apanel]], #0x10\n" + "ld1 { v4.8h }, [%x[Apanel]], #0x10\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" + "ldp q3, q2, [x22], #0x20\n" + ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n" + ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n" + ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n" + ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldp q1, q0, [x22], #0x20\n" + ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, v3.8h\n" + ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n" + ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n" + ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n" + ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n" + ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n" + ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n" + ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n" + ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n" + ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, 
v1.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], #0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp index 2938639048..8485820c7c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_bf16fp32_mmla_8x12( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_bf16fp32_mmla_8x12( ka.bblocks = bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr 
x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -85,31 +88,31 @@ void a64_interleaved_bf16fp32_mmla_8x12( "movi v31.16b, #0x0\n" "blt 4f\n" "3:" // main loop head - "ldr q3, [%x[Apanel], #0x0]\n" - "ldr q6, [x22, #0x0]\n" + "ldr q6, [%x[Apanel], #0x0]\n" + "ldr q7, [x22, #0x0]\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "ldr q7, [x22, #0x10]\n" + "ldr q3, [x22, #0x10]\n" ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" "sub x20, x20, #0x2\n" ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecda // bfmmla v26.4s, v6.8h, v4.8h\n" "ldr q4, [x22, #0x20]\n" - ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n" + ".inst 0x6e45ecdd // bfmmla v29.4s, v6.8h, v5.8h\n" "ldr q5, [x22, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0c // bfmmla v12.4s, v0.8h, v3.8h\n" + ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n" "cmp x20, #0x2\n" - ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n" - "ldr q7, [x22, #0x50]\n" + ".inst 0x6e43ec32 // bfmmla v18.4s, v1.8h, v3.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec58 // bfmmla v24.4s, v2.8h, v3.8h\n" + ".inst 0x6e47ecdb // bfmmla v27.4s, v6.8h, v7.8h\n" + "ldr q7, [x22, #0x40]\n" + ".inst 0x6e43ecde // bfmmla v30.4s, v6.8h, v3.8h\n" + "ldr q3, [x22, #0x50]\n" ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n" "ldr q0, [%x[Apanel], #0x10]\n" @@ -119,42 +122,42 @@ void a64_interleaved_bf16fp32_mmla_8x12( ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n" "ldr q2, [%x[Apanel], #0x30]\n" - ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecdc // bfmmla v28.4s, v6.8h, v4.8h\n" "ldr q4, [x22, #0x60]\n" - ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" - "ldr q3, [%x[Apanel], #0x40]\n" + ".inst 0x6e45ecdf // bfmmla v31.4s, v6.8h, v5.8h\n" + "ldr q6, [%x[Apanel], #0x40]\n" "ldr q5, [x22, #0x70]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - "ldr q6, [x22, #0x80]\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldr q7, [x22, #0x90]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0b // bfmmla v11.4s, v0.8h, v3.8h\n" + ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec31 // bfmmla v17.4s, v1.8h, v3.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec57 // bfmmla v23.4s, v2.8h, v3.8h\n" + ".inst 0x6e47ecda // bfmmla v26.4s, v6.8h, v7.8h\n" + "ldr q7, [x22, #0x80]\n" + ".inst 0x6e43ecdd // bfmmla v29.4s, v6.8h, v3.8h\n" + "ldr q3, [x22, 
#0x90]\n" ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" + ".inst 0x6e44ecdb // bfmmla v27.4s, v6.8h, v4.8h\n" "ldr q4, [x22, #0xa0]\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" + ".inst 0x6e45ecde // bfmmla v30.4s, v6.8h, v5.8h\n" "ldr q5, [x22, #0xb0]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e43ec0d // bfmmla v13.4s, v0.8h, v3.8h\n" "ldr q0, [%x[Apanel], #0x50]\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n" + ".inst 0x6e43ec33 // bfmmla v19.4s, v1.8h, v3.8h\n" "ldr q1, [%x[Apanel], #0x60]\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e43ec59 // bfmmla v25.4s, v2.8h, v3.8h\n" "ldr q2, [%x[Apanel], #0x70]\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecdc // bfmmla v28.4s, v6.8h, v7.8h\n" + ".inst 0x6e43ecdf // bfmmla v31.4s, v6.8h, v3.8h\n" "add %x[Apanel], %x[Apanel], #0x80\n" "add x22, x22, #0xc0\n" "bge 3b\n" @@ -191,89 +194,89 @@ void a64_interleaved_bf16fp32_mmla_8x12( ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n" ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n" "cbz x20, 5f\n" - "ldr q6, [x22, #0x0]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q7, [x22, #0x10]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q4, [x22, #0x20]\n" - "ldr q5, [x22, #0x30]\n" - ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "ldr q1, [x22, #0x0]\n" + "ldr q7, [%x[Apanel], #0x0]\n" + ".inst 0x6e41ece8 // bfmmla v8.4s, v7.8h, v1.8h\n" + "ldr q6, [%x[Apanel], #0x10]\n" + "ldr q0, [x22, #0x10]\n" + ".inst 0x6e40eceb // bfmmla v11.4s, v7.8h, v0.8h\n" + "ldr q5, [%x[Apanel], #0x20]\n" + "ldr q4, [%x[Apanel], #0x30]\n" + ".inst 0x6e41ecce // bfmmla v14.4s, v6.8h, v1.8h\n" + "ldr q3, [x22, #0x20]\n" + "ldr q2, [x22, #0x30]\n" + ".inst 0x6e40ecd1 // bfmmla v17.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb4 // bfmmla v20.4s, v5.8h, v1.8h\n" + ".inst 0x6e40ecb7 // bfmmla v23.4s, v5.8h, v0.8h\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n" - "ldr q7, [x22, #0x50]\n" - ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n" - ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n" - ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n" - ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n" + ".inst 0x6e41ec9a // bfmmla v26.4s, v4.8h, v1.8h\n" + "ldr q1, [x22, #0x40]\n" + ".inst 0x6e40ec9d // bfmmla v29.4s, v4.8h, v0.8h\n" + "ldr q0, [x22, #0x50]\n" + ".inst 0x6e43ece9 // bfmmla v9.4s, v7.8h, 
v3.8h\n" + ".inst 0x6e42ecec // bfmmla v12.4s, v7.8h, v2.8h\n" + ".inst 0x6e43eccf // bfmmla v15.4s, v6.8h, v3.8h\n" + ".inst 0x6e42ecd2 // bfmmla v18.4s, v6.8h, v2.8h\n" "add x22, x22, #0x60\n" - ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n" - ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n" - ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n" - ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" - ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" - ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" + ".inst 0x6e43ecb5 // bfmmla v21.4s, v5.8h, v3.8h\n" + ".inst 0x6e42ecb8 // bfmmla v24.4s, v5.8h, v2.8h\n" + ".inst 0x6e43ec9b // bfmmla v27.4s, v4.8h, v3.8h\n" + ".inst 0x6e42ec9e // bfmmla v30.4s, v4.8h, v2.8h\n" + ".inst 0x6e41ecea // bfmmla v10.4s, v7.8h, v1.8h\n" + ".inst 0x6e40eced // bfmmla v13.4s, v7.8h, v0.8h\n" + ".inst 0x6e41ecd0 // bfmmla v16.4s, v6.8h, v1.8h\n" + ".inst 0x6e40ecd3 // bfmmla v19.4s, v6.8h, v0.8h\n" + ".inst 0x6e41ecb6 // bfmmla v22.4s, v5.8h, v1.8h\n" + ".inst 0x6e40ecb9 // bfmmla v25.4s, v5.8h, v0.8h\n" + ".inst 0x6e41ec9c // bfmmla v28.4s, v4.8h, v1.8h\n" + ".inst 0x6e40ec9f // bfmmla v31.4s, v4.8h, v0.8h\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], #0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, 
[%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp index 4cc3ed040a..37a54fcfab 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return 12; } - static unsigned int stripe_width() - { - return 4; - } - static constexpr unsigned int k_unroll() { return 8; @@ -111,11 +106,9 @@ public: break; } } - }; } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp index e46cb8a67a..c1d37383df 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_s8s32_mmla_8x12_a510( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *Apanel, + const int8_t *Bpanel, + int32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_s8s32_mmla_8x12_a510( ka.bblocks = bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -82,28 +85,28 @@ void a64_interleaved_s8s32_mmla_8x12_a510( "movi v31.4s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n" - "ldp q6, q7, [x22], #0x20\n" + "ldp q3, q7, [x22], #0x20\n" ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n" ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" "sub x20, x20, #0x2\n" ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" + ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n" "cmp x20, #0x2\n" - ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n" + ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" + ".inst 0x4e83a409 // smmla v9.4s, v0.16b, v3.16b\n" ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e83a42f // smmla v15.4s, v1.16b, v3.16b\n" ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" + ".inst 0x4e83a455 // smmla v21.4s, v2.16b, v3.16b\n" ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x4e83a4db // smmla v27.4s, v6.16b, v3.16b\n" + ".inst 0x4e87a4de // smmla v30.4s, v6.16b, v7.16b\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n" ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n" "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" @@ -113,39 +116,39 @@ void a64_interleaved_s8s32_mmla_8x12_a510( ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n" ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" - ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" + ".inst 0x4e84a4dc // smmla v28.4s, 
v6.16b, v4.16b\n" + ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n" + ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n" + ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n" + ".inst 0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n" + ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" - ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" + ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n" + ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n" "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n" + ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n" "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n" "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n" + ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n" "bge 3b\n" "4:" // main loop skip "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" @@ -158,7 +161,7 @@ void a64_interleaved_s8s32_mmla_8x12_a510( ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n" ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n" - "ldp q4, q5, [x22], #0x20\n" + "ldp q5, q4, [x22], #0x20\n" ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" @@ -167,93 +170,93 @@ void a64_interleaved_s8s32_mmla_8x12_a510( ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n" ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n" - ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n" - ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n" - ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a433 // smmla v19.4s, v1.16b, 
v5.16b\n" - ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n" - ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" - ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" + ".inst 0x4e85a40a // smmla v10.4s, v0.16b, v5.16b\n" + ".inst 0x4e84a40d // smmla v13.4s, v0.16b, v4.16b\n" + ".inst 0x4e85a430 // smmla v16.4s, v1.16b, v5.16b\n" + ".inst 0x4e84a433 // smmla v19.4s, v1.16b, v4.16b\n" + ".inst 0x4e85a456 // smmla v22.4s, v2.16b, v5.16b\n" + ".inst 0x4e84a459 // smmla v25.4s, v2.16b, v4.16b\n" + ".inst 0x4e85a47c // smmla v28.4s, v3.16b, v5.16b\n" + ".inst 0x4e84a47f // smmla v31.4s, v3.16b, v4.16b\n" "cbz x20, 5f\n" - "ldp q6, q7, [x22], #0x20\n" - "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" - "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldp q4, q5, [x22], #0x20\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" - ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" - ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" - ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" - ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" - ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" - ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + "ldp q1, q0, [x22], #0x20\n" + "ld1 { v7.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n" + "ld1 { v5.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v4.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n" + "ldp q3, q2, [x22], #0x20\n" + ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n" + ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n" + ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n" + ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n" + ".inst 0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n" + ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n" + "ldp q1, q0, [x22], #0x20\n" + ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n" + ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n" + ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n" + ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n" + ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n" + ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n" + ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n" + ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n" + ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n" + ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n" + ".inst 0x4e81a4d0 // smmla v16.4s, 
v6.16b, v1.16b\n" + ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n" + ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n" + ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n" + ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n" + ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], #0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp index fc20c2fc9d..a097dc358a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_s8s32_mmla_8x12( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *Apanel, + const int8_t *Bpanel, + int32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_s8s32_mmla_8x12( ka.bblocks = 
bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -85,31 +88,31 @@ void a64_interleaved_s8s32_mmla_8x12( "movi v31.4s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ldr q3, [%x[Apanel], #0x0]\n" - "ldr q6, [x22, #0x0]\n" + "ldr q6, [%x[Apanel], #0x0]\n" + "ldr q7, [x22, #0x0]\n" ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" - "ldr q7, [x22, #0x10]\n" + "ldr q3, [x22, #0x10]\n" ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n" ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n" ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" "sub x20, x20, #0x2\n" ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" + ".inst 0x4e84a4da // smmla v26.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0x20]\n" - ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n" + ".inst 0x4e85a4dd // smmla v29.4s, v6.16b, v5.16b\n" "ldr q5, [x22, #0x30]\n" - ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n" + ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n" + ".inst 0x4e83a40c // smmla v12.4s, v0.16b, v3.16b\n" + ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n" "cmp x20, #0x2\n" - ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x50]\n" + ".inst 0x4e83a432 // smmla v18.4s, v1.16b, v3.16b\n" + ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n" + ".inst 0x4e83a458 // smmla v24.4s, v2.16b, v3.16b\n" + ".inst 0x4e87a4db // smmla v27.4s, v6.16b, v7.16b\n" + "ldr q7, [x22, #0x40]\n" + ".inst 0x4e83a4de // smmla v30.4s, v6.16b, v3.16b\n" + "ldr q3, [x22, #0x50]\n" ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n" ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n" "ldr q0, [%x[Apanel], #0x10]\n" @@ -119,42 +122,42 @@ void a64_interleaved_s8s32_mmla_8x12( ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n" ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n" "ldr q2, [%x[Apanel], #0x30]\n" - ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" + ".inst 0x4e84a4dc // smmla v28.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0x60]\n" - ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" - "ldr q3, [%x[Apanel], #0x40]\n" + ".inst 0x4e85a4df // smmla v31.4s, v6.16b, v5.16b\n" + "ldr q6, [%x[Apanel], #0x40]\n" "ldr q5, [x22, #0x70]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x80]\n" - ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x90]\n" + ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n" + ".inst 0x4e83a40b // smmla v11.4s, v0.16b, v3.16b\n" + ".inst 0x4e87a42e // smmla v14.4s, v1.16b, v7.16b\n" + ".inst 0x4e83a431 // smmla v17.4s, v1.16b, v3.16b\n" + ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n" + ".inst 0x4e83a457 // smmla v23.4s, v2.16b, v3.16b\n" + ".inst 
0x4e87a4da // smmla v26.4s, v6.16b, v7.16b\n" + "ldr q7, [x22, #0x80]\n" + ".inst 0x4e83a4dd // smmla v29.4s, v6.16b, v3.16b\n" + "ldr q3, [x22, #0x90]\n" ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" + ".inst 0x4e84a4db // smmla v27.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0xa0]\n" - ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" + ".inst 0x4e85a4de // smmla v30.4s, v6.16b, v5.16b\n" "ldr q5, [x22, #0xb0]\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" + ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n" + ".inst 0x4e83a40d // smmla v13.4s, v0.16b, v3.16b\n" "ldr q0, [%x[Apanel], #0x50]\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" + ".inst 0x4e87a430 // smmla v16.4s, v1.16b, v7.16b\n" + ".inst 0x4e83a433 // smmla v19.4s, v1.16b, v3.16b\n" "ldr q1, [%x[Apanel], #0x60]\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" + ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n" + ".inst 0x4e83a459 // smmla v25.4s, v2.16b, v3.16b\n" "ldr q2, [%x[Apanel], #0x70]\n" - ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + ".inst 0x4e87a4dc // smmla v28.4s, v6.16b, v7.16b\n" + ".inst 0x4e83a4df // smmla v31.4s, v6.16b, v3.16b\n" "add %x[Apanel], %x[Apanel], #0x80\n" "add x22, x22, #0xc0\n" "bge 3b\n" @@ -191,89 +194,89 @@ void a64_interleaved_s8s32_mmla_8x12( ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n" ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n" "cbz x20, 5f\n" - "ldr q6, [x22, #0x0]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q7, [x22, #0x10]\n" - ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n" - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n" - "ldr q4, [x22, #0x20]\n" - "ldr q5, [x22, #0x30]\n" - ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n" + "ldr q1, [x22, #0x0]\n" + "ldr q7, [%x[Apanel], #0x0]\n" + ".inst 0x4e81a4e8 // smmla v8.4s, v7.16b, v1.16b\n" + "ldr q6, [%x[Apanel], #0x10]\n" + "ldr q0, [x22, #0x10]\n" + ".inst 0x4e80a4eb // smmla v11.4s, v7.16b, v0.16b\n" + "ldr q5, [%x[Apanel], #0x20]\n" + "ldr q4, [%x[Apanel], #0x30]\n" + ".inst 0x4e81a4ce // smmla v14.4s, v6.16b, v1.16b\n" + "ldr q3, [x22, #0x20]\n" + "ldr q2, [x22, #0x30]\n" + ".inst 0x4e80a4d1 // smmla v17.4s, v6.16b, v0.16b\n" + ".inst 0x4e81a4b4 // smmla v20.4s, v5.16b, v1.16b\n" + ".inst 0x4e80a4b7 // smmla v23.4s, v5.16b, v0.16b\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x50]\n" - ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n" - ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n" - ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n" - ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n" + ".inst 
0x4e81a49a // smmla v26.4s, v4.16b, v1.16b\n" + "ldr q1, [x22, #0x40]\n" + ".inst 0x4e80a49d // smmla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x22, #0x50]\n" + ".inst 0x4e83a4e9 // smmla v9.4s, v7.16b, v3.16b\n" + ".inst 0x4e82a4ec // smmla v12.4s, v7.16b, v2.16b\n" + ".inst 0x4e83a4cf // smmla v15.4s, v6.16b, v3.16b\n" + ".inst 0x4e82a4d2 // smmla v18.4s, v6.16b, v2.16b\n" "add x22, x22, #0x60\n" - ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n" - ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n" - ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n" - ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n" - ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n" - ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n" - ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n" - ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n" - ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n" - ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" - ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n" - ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" + ".inst 0x4e83a4b5 // smmla v21.4s, v5.16b, v3.16b\n" + ".inst 0x4e82a4b8 // smmla v24.4s, v5.16b, v2.16b\n" + ".inst 0x4e83a49b // smmla v27.4s, v4.16b, v3.16b\n" + ".inst 0x4e82a49e // smmla v30.4s, v4.16b, v2.16b\n" + ".inst 0x4e81a4ea // smmla v10.4s, v7.16b, v1.16b\n" + ".inst 0x4e80a4ed // smmla v13.4s, v7.16b, v0.16b\n" + ".inst 0x4e81a4d0 // smmla v16.4s, v6.16b, v1.16b\n" + ".inst 0x4e80a4d3 // smmla v19.4s, v6.16b, v0.16b\n" + ".inst 0x4e81a4b6 // smmla v22.4s, v5.16b, v1.16b\n" + ".inst 0x4e80a4b9 // smmla v25.4s, v5.16b, v0.16b\n" + ".inst 0x4e81a49c // smmla v28.4s, v4.16b, v1.16b\n" + ".inst 0x4e80a49f // smmla v31.4s, v4.16b, v0.16b\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], 
#0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp index fa93c1d90d..0088557b8d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef __aarch64__ + #include "../std_transforms_fixed.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return 12; } - static unsigned int stripe_width() - { - return 4; - } - static constexpr unsigned int k_unroll() { return 8; @@ -116,5 +111,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp index 83301d80bb..54c51954c8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_u8u32_mmla_8x12_a510( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *Apanel, + const uint8_t *Bpanel, + uint32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_u8u32_mmla_8x12_a510( ka.bblocks = bblocks; __asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -82,28 +85,28 @@ void a64_interleaved_u8u32_mmla_8x12_a510( "movi v31.4s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n" - "ldp q6, q7, [x22], #0x20\n" + "ldp q3, q7, [x22], #0x20\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" "sub x20, x20, #0x2\n" ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n" "cmp x20, #0x2\n" - ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" + ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" + ".inst 0x6e83a409 // ummla v9.4s, v0.16b, v3.16b\n" ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e83a42f // ummla v15.4s, v1.16b, v3.16b\n" ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" + ".inst 0x6e83a455 // ummla v21.4s, v2.16b, v3.16b\n" ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x6e83a4db // ummla v27.4s, v6.16b, v3.16b\n" + ".inst 0x6e87a4de // ummla v30.4s, v6.16b, v7.16b\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" @@ -113,39 +116,39 @@ void a64_interleaved_u8u32_mmla_8x12_a510( ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" - ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" + ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, 
v4.16b\n" + ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" + ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n" + ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n" + ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n" + ".inst 0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n" + ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n" + "ldp q7, q3, [x22], #0x20\n" ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" - ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" + ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n" + ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n" "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n" "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n" + ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n" "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n" "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n" + ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n" "bge 3b\n" "4:" // main loop skip "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" @@ -158,7 +161,7 @@ void a64_interleaved_u8u32_mmla_8x12_a510( ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" - "ldp q4, q5, [x22], #0x20\n" + "ldp q5, q4, [x22], #0x20\n" ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" @@ -167,93 +170,93 @@ void a64_interleaved_u8u32_mmla_8x12_a510( ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n" ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n" ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n" - ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" - ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" - ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" - 
".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" - ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" - ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" + ".inst 0x6e85a40a // ummla v10.4s, v0.16b, v5.16b\n" + ".inst 0x6e84a40d // ummla v13.4s, v0.16b, v4.16b\n" + ".inst 0x6e85a430 // ummla v16.4s, v1.16b, v5.16b\n" + ".inst 0x6e84a433 // ummla v19.4s, v1.16b, v4.16b\n" + ".inst 0x6e85a456 // ummla v22.4s, v2.16b, v5.16b\n" + ".inst 0x6e84a459 // ummla v25.4s, v2.16b, v4.16b\n" + ".inst 0x6e85a47c // ummla v28.4s, v3.16b, v5.16b\n" + ".inst 0x6e84a47f // ummla v31.4s, v3.16b, v4.16b\n" "cbz x20, 5f\n" - "ldp q6, q7, [x22], #0x20\n" - "ld1 { v0.16b }, [%x[Apanel]], #0x10\n" - "ld1 { v1.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - "ld1 { v2.16b }, [%x[Apanel]], #0x10\n" - "ld1 { v3.16b }, [%x[Apanel]], #0x10\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldp q4, q5, [x22], #0x20\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" - "ldp q6, q7, [x22], #0x20\n" - ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" - ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" - ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" - ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" - ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" - ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + "ldp q1, q0, [x22], #0x20\n" + "ld1 { v7.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v6.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n" + "ld1 { v5.16b }, [%x[Apanel]], #0x10\n" + "ld1 { v4.16b }, [%x[Apanel]], #0x10\n" + ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n" + "ldp q3, q2, [x22], #0x20\n" + ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n" + ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n" + ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n" + ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n" + ".inst 0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n" + ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n" + "ldp q1, q0, [x22], #0x20\n" + ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n" + ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n" + ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n" + ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n" + ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n" + ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n" + ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n" + ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n" + ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n" + ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n" + ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, 
v1.16b\n" + ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n" + ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n" + ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n" + ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n" + ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], #0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp index c5342197c1..30260b9c29 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void a64_interleaved_u8u32_mmla_8x12( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *Apanel, + const uint8_t *Bpanel, + uint32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -43,7 +47,6 @@ void a64_interleaved_u8u32_mmla_8x12( ka.bblocks = bblocks; 
__asm__ __volatile__( - "1:" // Height loop "ldr x23, [%x[args_ptr], %[offsetof_bblocks]]\n" "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" @@ -85,31 +88,31 @@ void a64_interleaved_u8u32_mmla_8x12( "movi v31.4s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ldr q3, [%x[Apanel], #0x0]\n" - "ldr q6, [x22, #0x0]\n" + "ldr q6, [%x[Apanel], #0x0]\n" + "ldr q7, [x22, #0x0]\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" - "ldr q7, [x22, #0x10]\n" + "ldr q3, [x22, #0x10]\n" ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" "sub x20, x20, #0x2\n" ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + ".inst 0x6e84a4da // ummla v26.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0x20]\n" - ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" + ".inst 0x6e85a4dd // ummla v29.4s, v6.16b, v5.16b\n" "ldr q5, [x22, #0x30]\n" - ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + ".inst 0x6e83a40c // ummla v12.4s, v0.16b, v3.16b\n" + ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n" "cmp x20, #0x2\n" - ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x50]\n" + ".inst 0x6e83a432 // ummla v18.4s, v1.16b, v3.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + ".inst 0x6e83a458 // ummla v24.4s, v2.16b, v3.16b\n" + ".inst 0x6e87a4db // ummla v27.4s, v6.16b, v7.16b\n" + "ldr q7, [x22, #0x40]\n" + ".inst 0x6e83a4de // ummla v30.4s, v6.16b, v3.16b\n" + "ldr q3, [x22, #0x50]\n" ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" "ldr q0, [%x[Apanel], #0x10]\n" @@ -119,42 +122,42 @@ void a64_interleaved_u8u32_mmla_8x12( ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" "ldr q2, [%x[Apanel], #0x30]\n" - ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" + ".inst 0x6e84a4dc // ummla v28.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0x60]\n" - ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" - "ldr q3, [%x[Apanel], #0x40]\n" + ".inst 0x6e85a4df // ummla v31.4s, v6.16b, v5.16b\n" + "ldr q6, [%x[Apanel], #0x40]\n" "ldr q5, [x22, #0x70]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x80]\n" - ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x90]\n" + ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n" + ".inst 0x6e83a40b // ummla v11.4s, v0.16b, v3.16b\n" + ".inst 0x6e87a42e // ummla v14.4s, v1.16b, v7.16b\n" + ".inst 0x6e83a431 // ummla v17.4s, v1.16b, v3.16b\n" + ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n" + ".inst 0x6e83a457 // ummla v23.4s, v2.16b, v3.16b\n" + ".inst 
0x6e87a4da // ummla v26.4s, v6.16b, v7.16b\n" + "ldr q7, [x22, #0x80]\n" + ".inst 0x6e83a4dd // ummla v29.4s, v6.16b, v3.16b\n" + "ldr q3, [x22, #0x90]\n" ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" + ".inst 0x6e84a4db // ummla v27.4s, v6.16b, v4.16b\n" "ldr q4, [x22, #0xa0]\n" - ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" + ".inst 0x6e85a4de // ummla v30.4s, v6.16b, v5.16b\n" "ldr q5, [x22, #0xb0]\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n" + ".inst 0x6e83a40d // ummla v13.4s, v0.16b, v3.16b\n" "ldr q0, [%x[Apanel], #0x50]\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" + ".inst 0x6e87a430 // ummla v16.4s, v1.16b, v7.16b\n" + ".inst 0x6e83a433 // ummla v19.4s, v1.16b, v3.16b\n" "ldr q1, [%x[Apanel], #0x60]\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n" + ".inst 0x6e83a459 // ummla v25.4s, v2.16b, v3.16b\n" "ldr q2, [%x[Apanel], #0x70]\n" - ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + ".inst 0x6e87a4dc // ummla v28.4s, v6.16b, v7.16b\n" + ".inst 0x6e83a4df // ummla v31.4s, v6.16b, v3.16b\n" "add %x[Apanel], %x[Apanel], #0x80\n" "add x22, x22, #0xc0\n" "bge 3b\n" @@ -191,89 +194,89 @@ void a64_interleaved_u8u32_mmla_8x12( ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" "cbz x20, 5f\n" - "ldr q6, [x22, #0x0]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q7, [x22, #0x10]\n" - ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" - "ldr q2, [%x[Apanel], #0x20]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "ldr q4, [x22, #0x20]\n" - "ldr q5, [x22, #0x30]\n" - ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + "ldr q1, [x22, #0x0]\n" + "ldr q7, [%x[Apanel], #0x0]\n" + ".inst 0x6e81a4e8 // ummla v8.4s, v7.16b, v1.16b\n" + "ldr q6, [%x[Apanel], #0x10]\n" + "ldr q0, [x22, #0x10]\n" + ".inst 0x6e80a4eb // ummla v11.4s, v7.16b, v0.16b\n" + "ldr q5, [%x[Apanel], #0x20]\n" + "ldr q4, [%x[Apanel], #0x30]\n" + ".inst 0x6e81a4ce // ummla v14.4s, v6.16b, v1.16b\n" + "ldr q3, [x22, #0x20]\n" + "ldr q2, [x22, #0x30]\n" + ".inst 0x6e80a4d1 // ummla v17.4s, v6.16b, v0.16b\n" + ".inst 0x6e81a4b4 // ummla v20.4s, v5.16b, v1.16b\n" + ".inst 0x6e80a4b7 // ummla v23.4s, v5.16b, v0.16b\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" - "ldr q6, [x22, #0x40]\n" - ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" - "ldr q7, [x22, #0x50]\n" - ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n" - ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n" - ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n" - ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n" + ".inst 
0x6e81a49a // ummla v26.4s, v4.16b, v1.16b\n" + "ldr q1, [x22, #0x40]\n" + ".inst 0x6e80a49d // ummla v29.4s, v4.16b, v0.16b\n" + "ldr q0, [x22, #0x50]\n" + ".inst 0x6e83a4e9 // ummla v9.4s, v7.16b, v3.16b\n" + ".inst 0x6e82a4ec // ummla v12.4s, v7.16b, v2.16b\n" + ".inst 0x6e83a4cf // ummla v15.4s, v6.16b, v3.16b\n" + ".inst 0x6e82a4d2 // ummla v18.4s, v6.16b, v2.16b\n" "add x22, x22, #0x60\n" - ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n" - ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n" - ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n" - ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n" - ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" - ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" - ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" - ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" - ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" - ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" - ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" - ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" + ".inst 0x6e83a4b5 // ummla v21.4s, v5.16b, v3.16b\n" + ".inst 0x6e82a4b8 // ummla v24.4s, v5.16b, v2.16b\n" + ".inst 0x6e83a49b // ummla v27.4s, v4.16b, v3.16b\n" + ".inst 0x6e82a49e // ummla v30.4s, v4.16b, v2.16b\n" + ".inst 0x6e81a4ea // ummla v10.4s, v7.16b, v1.16b\n" + ".inst 0x6e80a4ed // ummla v13.4s, v7.16b, v0.16b\n" + ".inst 0x6e81a4d0 // ummla v16.4s, v6.16b, v1.16b\n" + ".inst 0x6e80a4d3 // ummla v19.4s, v6.16b, v0.16b\n" + ".inst 0x6e81a4b6 // ummla v22.4s, v5.16b, v1.16b\n" + ".inst 0x6e80a4b9 // ummla v25.4s, v5.16b, v0.16b\n" + ".inst 0x6e81a49c // ummla v28.4s, v4.16b, v1.16b\n" + ".inst 0x6e80a49f // ummla v31.4s, v4.16b, v0.16b\n" "5:" // multiply loop done "subs x23, x23, #0x1\n" - "uzp1 v4.2d, v8.2d, v11.2d\n" + "uzp1 v0.2d, v8.2d, v11.2d\n" "uzp2 v8.2d, v8.2d, v11.2d\n" - "uzp1 v11.2d, v9.2d, v12.2d\n" + "uzp1 v1.2d, v9.2d, v12.2d\n" "uzp2 v9.2d, v9.2d, v12.2d\n" - "str q4, [%x[Cpanel], #0x0]\n" - "uzp1 v12.2d, v10.2d, v13.2d\n" + "str q0, [%x[Cpanel], #0x0]\n" + "uzp1 v0.2d, v10.2d, v13.2d\n" "uzp2 v10.2d, v10.2d, v13.2d\n" - "str q11, [%x[Cpanel], #0x10]\n" - "str q12, [%x[Cpanel], #0x20]\n" - "uzp1 v13.2d, v14.2d, v17.2d\n" + "str q1, [%x[Cpanel], #0x10]\n" + "str q0, [%x[Cpanel], #0x20]\n" + "uzp1 v0.2d, v14.2d, v17.2d\n" "uzp2 v14.2d, v14.2d, v17.2d\n" "str q8, [%x[Cpanel], #0x30]\n" - "uzp1 v17.2d, v15.2d, v18.2d\n" + "uzp1 v2.2d, v15.2d, v18.2d\n" "uzp2 v15.2d, v15.2d, v18.2d\n" "str q9, [%x[Cpanel], #0x40]\n" - "uzp1 v18.2d, v16.2d, v19.2d\n" + "uzp1 v17.2d, v16.2d, v19.2d\n" "uzp2 v16.2d, v16.2d, v19.2d\n" "str q10, [%x[Cpanel], #0x50]\n" - "uzp1 v19.2d, v20.2d, v23.2d\n" + "uzp1 v1.2d, v20.2d, v23.2d\n" "uzp2 v20.2d, v20.2d, v23.2d\n" - "str q13, [%x[Cpanel], #0x60]\n" - "uzp1 v23.2d, v21.2d, v24.2d\n" + "str q0, [%x[Cpanel], #0x60]\n" + "uzp1 v0.2d, v21.2d, v24.2d\n" "uzp2 v21.2d, v21.2d, v24.2d\n" - "str q17, [%x[Cpanel], #0x70]\n" - "uzp1 v24.2d, v22.2d, v25.2d\n" + "str q2, [%x[Cpanel], #0x70]\n" + "uzp1 v23.2d, v22.2d, v25.2d\n" "uzp2 v22.2d, v22.2d, v25.2d\n" - "str q18, [%x[Cpanel], #0x80]\n" - "uzp1 v25.2d, v26.2d, v29.2d\n" + "str q17, [%x[Cpanel], #0x80]\n" + "uzp1 v19.2d, v26.2d, v29.2d\n" "uzp2 v26.2d, v26.2d, v29.2d\n" "str q14, [%x[Cpanel], #0x90]\n" - "uzp1 v29.2d, v27.2d, v30.2d\n" + "uzp1 v18.2d, v27.2d, v30.2d\n" "uzp2 v27.2d, v27.2d, v30.2d\n" "str q15, [%x[Cpanel], #0xa0]\n" - "uzp1 v30.2d, v28.2d, v31.2d\n" + "uzp1 v17.2d, v28.2d, v31.2d\n" "uzp2 v28.2d, v28.2d, v31.2d\n" "str q16, [%x[Cpanel], 
#0xb0]\n" - "str q19, [%x[Cpanel], #0xc0]\n" - "str q23, [%x[Cpanel], #0xd0]\n" - "str q24, [%x[Cpanel], #0xe0]\n" + "str q1, [%x[Cpanel], #0xc0]\n" + "str q0, [%x[Cpanel], #0xd0]\n" + "str q23, [%x[Cpanel], #0xe0]\n" "str q20, [%x[Cpanel], #0xf0]\n" "str q21, [%x[Cpanel], #0x100]\n" "str q22, [%x[Cpanel], #0x110]\n" - "str q25, [%x[Cpanel], #0x120]\n" - "str q29, [%x[Cpanel], #0x130]\n" - "str q30, [%x[Cpanel], #0x140]\n" + "str q19, [%x[Cpanel], #0x120]\n" + "str q18, [%x[Cpanel], #0x130]\n" + "str q17, [%x[Cpanel], #0x140]\n" "str q26, [%x[Cpanel], #0x150]\n" "str q27, [%x[Cpanel], #0x160]\n" "str q28, [%x[Cpanel], #0x170]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp index f86bcebe64..76f43f0933 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,19 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - -#ifdef __aarch64__ +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "../std_transforms_sme.hpp" #include "../bfloat.hpp" @@ -84,4 +83,4 @@ public: #undef ARGLIST -#endif // __aarch64__ +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp index 520eeedfec..db29e42ef1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE -#ifdef ARM_COMPUTE_ENABLE_SME2 + +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "arm_gemm.hpp" #include "../../utils.hpp" @@ -62,7 +62,7 @@ void sme2_gemv_bf16fp32_dot_16VL ( break; } __asm__ __volatile__( - "ptrue p1.b\n" + "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" "cntw x28, ALL, MUL #4\n" "add x27, %x[N], x28\n" @@ -102,311 +102,311 @@ void sme2_gemv_bf16fp32_dot_16VL ( "bgt 20f\n" "beq 12f\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x1\n" + "lsl x21, %x[K], #0x1\n" "mov x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 5f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" "b 6f\n" "5:" // Width 1: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "6:" // Width 1: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 8f\n" "7:" // Width 1: Multiply loop: Main loop head - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "sub x21, x21, #0x8\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z8.h }, p0/Z, [x23]\n" + "sub x22, x22, #0x8\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158b298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[0]\n" "addvl x26, x26, #16\n" - "cmp x21, #0x8\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" + "cmp x22, #0x8\n" + ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158b498 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[1]\n" "addvl x26, x26, #16\n" "add x23, x23, #0x10\n" - ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" + ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158bb98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z8.h[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158bf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z8.h[3]\n" "addvl x26, x26, #16\n" "bgt 7b\n" "8:" // Width 1: Multiply loop: Single iteration only - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "subs x21, x21, #0x2\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z11.h }, p0/Z, [x23]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - "addvl x26, x26, #16\n" - "ble 9f\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" + ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n" "addvl x26, x26, #16\n" "ble 9f\n" ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], 
{ z12.h-z15.h }, z10.h[2]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n" "addvl x26, x26, #16\n" "ble 9f\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n" + "addvl x26, x26, #16\n" + "ble 9f\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15bbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z11.h[3]\n" "addvl x26, x26, #16\n" "9:" // Width 1: Multiply loop: multiply skip "tbz %x[flags], #1, 10f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" + "ld1rw { z3.s }, p1/Z, [x21]\n" + "ld1rw { z29.s }, p1/Z, [x20]\n" + ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n" ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "b 11f\n" "10:" // Width 1: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "11:" // Width 1: Output done "b 36f\n" "12:" // Width 2 "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x1\n" + "lsl x21, %x[K], #0x1\n" "sub x20, %x[N], x28\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 13f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" + ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" "b 14f\n" "13:" // Width 2: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "14:" // Width 2: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 16f\n" "15:" // Width 2: Multiply loop: Main loop head - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "sub x21, x21, #0x8\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - "cmp x21, #0x8\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z9.h }, p0/Z, [x23]\n" + "sub x22, x22, #0x8\n" + ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" + ".inst 0xc159b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[0]\n" + "cmp x22, #0x8\n" "add x23, x23, #0x10\n" ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" + ".inst 
0xc159b099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[0]\n" "addvl x26, x26, #16\n" ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" + ".inst 0xc159b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z9.h[1]\n" + ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z9.h[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" + ".inst 0xc159bb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z9.h[2]\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159b819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z9.h[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + ".inst 0xc159bc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z9.h[3]\n" + ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159bf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z9.h[3]\n" "addvl x26, x26, #16\n" "bgt 15b\n" "16:" // Width 2: Multiply loop: Single iteration only - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "subs x21, x21, #0x2\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z11.h }, p0/Z, [x23]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" + ".inst 0xc15bb198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[0]\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z11.h[1]\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bb419 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[1]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[2]\n" + ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bbb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z11.h[2]\n" "addvl x26, x26, #16\n" "ble 17f\n" 
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" + ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n" "addvl x26, x26, #16\n" "17:" // Width 2: Multiply loop: multiply skip "tbz %x[flags], #1, 18f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n" + "ld1rw { z9.s }, p1/Z, [x21]\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + "ld1rw { z8.s }, p1/Z, [x20]\n" + ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n" + ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n" + ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n" + ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n" "addvl x25, x25, #8\n" "b 19f\n" "18:" // Width 2: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n" + ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n" + ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" + ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n" "addvl x25, x25, #8\n" "19:" // Width 2: Output done "b 36f\n" "20:" // Width 3 "mov x20, #0x2\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x1\n" + "lsl x21, %x[K], #0x1\n" "msub x20, x28, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 21f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" - ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" + ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n" + ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n" "b 22f\n" "21:" // Width 3: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "22:" // Width 3: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 24f\n" "23:" // Width 3: 
Multiply loop: Main loop head - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "sub x21, x21, #0x8\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z15.h }, p0/Z, [x23]\n" + "sub x22, x22, #0x8\n" ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - "cmp x21, #0x8\n" + ".inst 0xc15fb018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[0]\n" + "cmp x22, #0x8\n" "add x23, x23, #0x10\n" ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" - ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n" + ".inst 0xc15fb099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[0]\n" + ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" - ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n" + ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fb51a // bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" - ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15fbb18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[2]\n" + ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb919 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[2]\n" + ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n" "addvl x26, x26, #16\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n" + ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fbd1a // 
bfdot za.s[x9, 2], { z8.h-z11.h }, z15.h[3]\n" "addvl x26, x26, #16\n" "bgt 23b\n" "24:" // Width 3: Multiply loop: Single iteration only - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "subs x21, x21, #0x2\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z11.h }, p0/Z, [x23]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" + ".inst 0xc15bb398 // bfdot za.s[x9, 0], { z28.h-z31.h }, z11.h[0]\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z11.h[0]\n" ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n" - "addvl x26, x26, #16\n" - "ble 25f\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" - ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n" + ".inst 0xc15bb29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z11.h[0]\n" "addvl x26, x26, #16\n" "ble 25f\n" ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[1]\n" + ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z11.h[1]\n" ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n" + ".inst 0xc15bb79a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[1]\n" "addvl x26, x26, #16\n" "ble 25f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" + ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z11.h[2]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[2]\n" + ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15bb99a // bfdot za.s[x9, 2], { z12.h-z15.h }, z11.h[2]\n" + "addvl x26, x26, #16\n" + "ble 25f\n" + ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15bbd98 // bfdot za.s[x9, 0], { z12.h-z15.h }, z11.h[3]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bbe99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[3]\n" ".inst 
0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n" "addvl x26, x26, #16\n" "25:" // Width 3: Multiply loop: multiply skip "tbz %x[flags], #1, 26f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n" - ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n" + "ld1rw { z17.s }, p1/Z, [x21]\n" + ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n" + "ld1rw { z16.s }, p1/Z, [x20]\n" + ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n" + ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n" + ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n" + ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "b 27f\n" "26:" // Width 3: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n" + ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n" + ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" + ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "27:" // Width 3: Output done "b 36f\n" "28:" // Width 4 "mov x20, #0x3\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x1\n" + "lsl x21, %x[K], #0x1\n" "msub x20, x28, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 29f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" - ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n" + ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e01 // mova 
za.d[x9, #1], { z16.d-z19.d }\n" + ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n" ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n" ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n" "addvl x24, x24, #16\n" @@ -414,126 +414,126 @@ void sme2_gemv_bf16fp32_dot_16VL ( "29:" // Width 4: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "30:" // Width 4: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 32f\n" "31:" // Width 4: Multiply loop: Main loop head - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "sub x21, x21, #0x8\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - "cmp x21, #0x8\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z8.h }, p0/Z, [x23]\n" + "sub x22, x22, #0x8\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158b218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z8.h[0]\n" + "cmp x22, #0x8\n" "add x23, x23, #0x10\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" - ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n" - ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" - ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n" - ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n" + ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158b199 // bfdot za.s[x9, 1], { z12.h-z15.h }, z8.h[0]\n" + ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[0]\n" + ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158b19b // bfdot za.s[x9, 3], { z12.h-z15.h }, z8.h[0]\n" "addvl x26, x26, #16\n" ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" - ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" - ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n" + ".inst 0xc158b598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z8.h[1]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158b699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z8.h[1]\n" + ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158b61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[1]\n" ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 
0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n" + ".inst 0xc158b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" + ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158b898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z8.h[2]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158ba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[2]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n" - ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc158ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z8.h[2]\n" + ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158b81b // bfdot za.s[x9, 3], { z0.h-z3.h }, z8.h[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + ".inst 0xc158be98 // bfdot za.s[x9, 0], { z20.h-z23.h }, z8.h[3]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158be19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z8.h[3]\n" + ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158bc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z8.h[3]\n" + ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158be9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z8.h[3]\n" "addvl x26, x26, #16\n" "bgt 31b\n" "32:" // Width 4: Multiply loop: Single iteration only - "whilelt p0.h, XZR, x21\n" - "ld1rqh { z10.h }, p0/Z, [x23]\n" - "subs x21, x21, #0x2\n" - ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "whilelt p0.h, XZR, x22\n" + "ld1rqh { z11.h }, p0/Z, [x23]\n" + "subs x22, x22, #0x2\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n" - ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n" + ".inst 0xc15bb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[0]\n" + ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bb299 // bfdot za.s[x9, 1], { z20.h-z23.h }, z11.h[0]\n" + ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15bb39a // bfdot za.s[x9, 2], { z28.h-z31.h }, z11.h[0]\n" ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n" + ".inst 0xc15bb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[0]\n" "addvl x26, x26, #16\n" "ble 33f\n" - ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bb418 // bfdot za.s[x9, 0], { 
z0.h-z3.h }, z11.h[1]\n" ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n" - ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n" + ".inst 0xc15bb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[1]\n" + ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15bb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[1]\n" ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n" + ".inst 0xc15bb61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[1]\n" "addvl x26, x26, #16\n" "ble 33f\n" - ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15bba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[2]\n" ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n" - ".inst 0xa042a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n" - ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n" + ".inst 0xc15bba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z11.h[2]\n" + ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15bba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[2]\n" + ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15bba1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[2]\n" "addvl x26, x26, #16\n" "ble 33f\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc15bbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z11.h[3]\n" ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n" + ".inst 0xc15bbf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z11.h[3]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc15bbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z11.h[3]\n" ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n" + ".inst 0xc15bbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z11.h[3]\n" "addvl x26, x26, #16\n" "33:" // Width 4: Multiply loop: multiply skip "tbz %x[flags], #1, 34f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s 
}, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n" - ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n" - ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n" - ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + "ld1rw { z21.s }, p1/Z, [x21]\n" + ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n" + "ld1rw { z20.s }, p1/Z, [x20]\n" + ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n" + ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" + ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n" + ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n" + ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n" + ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" + ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n" "addvl x25, x25, #16\n" "b 35f\n" "34:" // Width 4: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n" ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n" - ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n" - ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n" + ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n" + ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n" "addvl x25, x25, #16\n" "35:" // Width 4: Output done "subs x27, x27, #0x4\n" @@ -541,7 +541,7 @@ void sme2_gemv_bf16fp32_dot_16VL ( "bgt 4b\n" "36:" // Exit ".inst 0xd503467f // SMSTOP\n" - "ptrue p1.b\n" + "ptrue p8.b\n" : [N] "+&r" (N) : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -550,5 +550,4 @@ void sme2_gemv_bf16fp32_dot_16VL ( } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SME2 -#endif +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp index f33cb9a33d..7d98d5cb98 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm 
Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,19 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - -#ifdef __aarch64__ +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "../std_transforms_sme.hpp" #define ARGLIST \ @@ -83,4 +82,4 @@ public: #undef ARGLIST -#endif // __aarch64__ +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp index 9224868e6a..d2c260536d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE -#ifdef ARM_COMPUTE_ENABLE_SME2 + +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "arm_gemm.hpp" #include "../../utils.hpp" @@ -61,7 +61,7 @@ void sme2_gemv_fp32_mla_16VL ( break; } __asm__ __volatile__( - "ptrue p1.b\n" + "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" "cntw x28, ALL, MUL #4\n" "add x27, %x[N], x28\n" @@ -101,311 +101,311 @@ void sme2_gemv_fp32_mla_16VL ( "bgt 20f\n" "beq 12f\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "mov x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 5f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" "b 6f\n" "5:" // Width 1: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "6:" // Width 1: setup done - "cmp x21, #0x4\n" + "cmp x22, #0x4\n" "ble 8f\n" "7:" // Width 1: Multiply loop: Main loop head - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "sub x21, x21, #0x4\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z8.s }, p0/Z, [x23]\n" + "sub x22, x22, #0x4\n" + ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158a280 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[0]\n" "addvl x26, x26, #16\n" - "cmp x21, #0x4\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" + "cmp x22, #0x4\n" + ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158a480 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[1]\n" "addvl x26, x26, #16\n" "add x23, x23, #0x10\n" - ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" + ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158ab80 // fmla za.s[x9, 0], { z28.s-z31.s }, z8.s[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158af00 // fmla za.s[x9, 0], { z24.s-z27.s }, z8.s[3]\n" "addvl x26, x26, #16\n" "bgt 7b\n" "8:" // Width 1: Multiply loop: Single iteration only - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "subs x21, x21, #0x1\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z11.s }, p0/Z, [x23]\n" + "subs x22, x22, #0x1\n" + ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - "addvl x26, x26, #16\n" - "ble 9f\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" + ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n" "addvl x26, x26, #16\n" "ble 9f\n" ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, 
z10.s[2]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n" "addvl x26, x26, #16\n" "ble 9f\n" ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n" + "addvl x26, x26, #16\n" + "ble 9f\n" + ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + ".inst 0xc15bac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[3]\n" "addvl x26, x26, #16\n" "9:" // Width 1: Multiply loop: multiply skip "tbz %x[flags], #1, 10f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" + "ld1rw { z3.s }, p1/Z, [x21]\n" + "ld1rw { z29.s }, p1/Z, [x20]\n" + ".inst 0xc1bdc868 // fclamp { z8.s-z11.s }, z3.s, z29.s\n" ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "b 11f\n" "10:" // Width 1: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 0xa060c32c // st1w { z12.s-z15.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "11:" // Width 1: Output done "b 36f\n" "12:" // Width 2 "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "sub x20, %x[N], x28\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 13f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" + ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa041c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" "b 14f\n" "13:" // Width 2: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "14:" // Width 2: setup done - "cmp x21, #0x4\n" + "cmp x22, #0x4\n" "ble 16f\n" "15:" // Width 2: Multiply loop: Main loop head - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "sub x21, x21, #0x4\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - "cmp x21, #0x4\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z9.s }, p0/Z, [x23]\n" + "sub x22, x22, #0x4\n" + ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" + ".inst 0xc159a180 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[0]\n" + "cmp x22, #0x4\n" "add x23, x23, #0x10\n" ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n" + ".inst 0xc159a081 // fmla za.s[x9, 1], { 
z4.s-z7.s }, z9.s[0]\n" "addvl x26, x26, #16\n" ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" + ".inst 0xc159a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z9.s[1]\n" + ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159a481 // fmla za.s[x9, 1], { z4.s-z7.s }, z9.s[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" + ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" + ".inst 0xc159ab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z9.s[2]\n" + ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159a801 // fmla za.s[x9, 1], { z0.s-z3.s }, z9.s[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + ".inst 0xc159ac00 // fmla za.s[x9, 0], { z0.s-z3.s }, z9.s[3]\n" + ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc159af81 // fmla za.s[x9, 1], { z28.s-z31.s }, z9.s[3]\n" "addvl x26, x26, #16\n" "bgt 15b\n" "16:" // Width 2: Multiply loop: Single iteration only - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "subs x21, x21, #0x1\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z11.s }, p0/Z, [x23]\n" + "subs x22, x22, #0x1\n" + ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" + ".inst 0xc15ba180 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[0]\n" + ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba700 // fmla za.s[x9, 0], { z24.s-z27.s }, z11.s[1]\n" + ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ba401 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[1]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba980 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[2]\n" + ".inst 0xa041c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bab81 // fmla za.s[x9, 1], { z28.s-z31.s }, z11.s[2]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, 
[x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" + ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n" "addvl x26, x26, #16\n" "17:" // Width 2: Multiply loop: multiply skip "tbz %x[flags], #1, 18f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n" + "ld1rw { z9.s }, p1/Z, [x21]\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + "ld1rw { z8.s }, p1/Z, [x20]\n" + ".inst 0xc1a8c920 // fclamp { z0.s-z3.s }, z9.s, z8.s\n" + ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n" + ".inst 0xc1a8c924 // fclamp { z4.s-z7.s }, z9.s, z8.s\n" + ".inst 0xa061c324 // st1w { z4.s-z7.s }, p8, [x25, #0x4, MUL VL]\n" "addvl x25, x25, #8\n" "b 19f\n" "18:" // Width 2: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - ".inst 0xa061c334 // st1w { z20.s-z23.s }, p8, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n" + ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n" + ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" + ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n" "addvl x25, x25, #8\n" "19:" // Width 2: Output done "b 36f\n" "20:" // Width 3 "mov x20, #0x2\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "msub x20, x28, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 21f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" - ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" + ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n" + ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n" "b 22f\n" "21:" // Width 3: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "22:" // Width 3: setup done - "cmp x21, #0x4\n" + "cmp x22, #0x4\n" "ble 24f\n" "23:" // Width 3: Multiply loop: Main loop head - "whilelt p0.s, XZR, 
x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "sub x21, x21, #0x4\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z15.s }, p0/Z, [x23]\n" + "sub x22, x22, #0x4\n" ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - "cmp x21, #0x4\n" + ".inst 0xc15fa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z15.s[0]\n" + "cmp x22, #0x4\n" "add x23, x23, #0x10\n" ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" - ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n" + ".inst 0xc15fa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z15.s[0]\n" + ".inst 0xa042c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fa002 // fmla za.s[x9, 2], { z0.s-z3.s }, z15.s[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n" - ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n" + ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n" + ".inst 0xc15fa680 // fmla za.s[x9, 0], { z20.s-z23.s }, z15.s[1]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fa681 // fmla za.s[x9, 1], { z20.s-z23.s }, z15.s[1]\n" + ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fa502 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" - ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n" + ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" + ".inst 0xc15fab00 // fmla za.s[x9, 0], { z24.s-z27.s }, z15.s[2]\n" + ".inst 0xa041c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fa901 // fmla za.s[x9, 1], { z8.s-z11.s }, z15.s[2]\n" + ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15faa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z15.s[2]\n" "addvl x26, x26, #16\n" ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" - ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc15fae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z15.s[3]\n" + ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z15.s[3]\n" + ".inst 0xa042c749 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fad02 // fmla za.s[x9, 2], { z8.s-z11.s }, z15.s[3]\n" "addvl x26, x26, #16\n" "bgt 
23b\n" "24:" // Width 3: Multiply loop: Single iteration only - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "subs x21, x21, #0x1\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z11.s }, p0/Z, [x23]\n" + "subs x22, x22, #0x1\n" + ".inst 0xa040c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" + ".inst 0xc15ba380 // fmla za.s[x9, 0], { z28.s-z31.s }, z11.s[0]\n" + ".inst 0xa041c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ba001 // fmla za.s[x9, 1], { z0.s-z3.s }, z11.s[0]\n" ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n" - "addvl x26, x26, #16\n" - "ble 25f\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n" - ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n" + ".inst 0xc15ba282 // fmla za.s[x9, 2], { z20.s-z23.s }, z11.s[0]\n" "addvl x26, x26, #16\n" "ble 25f\n" ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba580 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[1]\n" + ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ba481 // fmla za.s[x9, 1], { z4.s-z7.s }, z11.s[1]\n" ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n" + ".inst 0xc15ba782 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[1]\n" "addvl x26, x26, #16\n" "ble 25f\n" - ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" + ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba880 // fmla za.s[x9, 0], { z4.s-z7.s }, z11.s[2]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15baa81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[2]\n" + ".inst 0xa042c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15ba982 // fmla za.s[x9, 2], { z12.s-z15.s }, z11.s[2]\n" + "addvl x26, x26, #16\n" + "ble 25f\n" + ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" + ".inst 0xc15bad80 // fmla za.s[x9, 0], { z12.s-z15.s }, z11.s[3]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15bae81 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[3]\n" ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aae02 // fmla za.s[x9, 
2], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n" "addvl x26, x26, #16\n" "25:" // Width 3: Multiply loop: multiply skip "tbz %x[flags], #1, 26f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n" - ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n" + "ld1rw { z17.s }, p1/Z, [x21]\n" + ".inst 0xc0062c28 // mova { z8.d-z11.d }, za.d[x9, #1]\n" + "ld1rw { z16.s }, p1/Z, [x20]\n" + ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xa060c724 // st1w { z4.s-z7.s }, pn9.b, [x25]\n" + ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n" + ".inst 0xa061c728 // st1w { z8.s-z11.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n" + ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "b 27f\n" "26:" // Width 3: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n" + ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n" + ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" + ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "27:" // Width 3: Output done "b 36f\n" "28:" // Width 4 "mov x20, #0x3\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "msub x20, x28, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 29f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c708 // ld1w { z8.s-z11.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n" - ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n" + ".inst 0xa040c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" + ".inst 0xa042c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x8, MUL 
VL]\n" + ".inst 0xc0042d82 // mova za.d[x9, #2], { z12.d-z15.d }\n" ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n" ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n" "addvl x24, x24, #16\n" @@ -413,126 +413,126 @@ void sme2_gemv_fp32_mla_16VL ( "29:" // Width 4: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "30:" // Width 4: setup done - "cmp x21, #0x4\n" + "cmp x22, #0x4\n" "ble 32f\n" "31:" // Width 4: Multiply loop: Main loop head - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "sub x21, x21, #0x4\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - "cmp x21, #0x4\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z8.s }, p0/Z, [x23]\n" + "sub x22, x22, #0x4\n" + ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158a200 // fmla za.s[x9, 0], { z16.s-z19.s }, z8.s[0]\n" + "cmp x22, #0x4\n" "add x23, x23, #0x10\n" - ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" - ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n" - ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n" - ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n" - ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n" + ".inst 0xa041c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158a181 // fmla za.s[x9, 1], { z12.s-z15.s }, z8.s[0]\n" + ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158a202 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[0]\n" + ".inst 0xa043c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158a183 // fmla za.s[x9, 3], { z12.s-z15.s }, z8.s[0]\n" "addvl x26, x26, #16\n" ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" - ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" - ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n" + ".inst 0xc158a580 // fmla za.s[x9, 0], { z12.s-z15.s }, z8.s[1]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158a681 // fmla za.s[x9, 1], { z20.s-z23.s }, z8.s[1]\n" + ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158a602 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[1]\n" ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n" + ".inst 0xc158a683 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[1]\n" 
"addvl x26, x26, #16\n" - ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" + ".inst 0xa040c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158a880 // fmla za.s[x9, 0], { z4.s-z7.s }, z8.s[2]\n" + ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158aa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[2]\n" ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n" - ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc158aa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z8.s[2]\n" + ".inst 0xa043c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158a803 // fmla za.s[x9, 3], { z0.s-z3.s }, z8.s[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26]\n" + ".inst 0xc158ae80 // fmla za.s[x9, 0], { z20.s-z23.s }, z8.s[3]\n" + ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc158ae01 // fmla za.s[x9, 1], { z16.s-z19.s }, z8.s[3]\n" + ".inst 0xa042c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc158ac82 // fmla za.s[x9, 2], { z4.s-z7.s }, z8.s[3]\n" + ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc158ae83 // fmla za.s[x9, 3], { z20.s-z23.s }, z8.s[3]\n" "addvl x26, x26, #16\n" "bgt 31b\n" "32:" // Width 4: Multiply loop: Single iteration only - "whilelt p0.s, XZR, x21\n" - "ld1rqw { z10.s }, p0/Z, [x23]\n" - "subs x21, x21, #0x1\n" - ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + "whilelt p0.s, XZR, x22\n" + "ld1rqw { z11.s }, p0/Z, [x23]\n" + "subs x22, x22, #0x1\n" + ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n" - ".inst 0xa041c745 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n" - ".inst 0xa042c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n" + ".inst 0xc15ba200 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[0]\n" + ".inst 0xa041c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ba281 // fmla za.s[x9, 1], { z20.s-z23.s }, z11.s[0]\n" + ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15ba382 // fmla za.s[x9, 2], { z28.s-z31.s }, z11.s[0]\n" ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n" + ".inst 0xc15ba203 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[0]\n" "addvl x26, x26, #16\n" "ble 33f\n" - ".inst 0xa040c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n" + ".inst 0xa040c741 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15ba400 // fmla za.s[x9, 0], { z0.s-z3.s }, z11.s[1]\n" ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s 
}, z10.s[1]\n" - ".inst 0xa042c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n" + ".inst 0xc15ba601 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[1]\n" + ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15ba602 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[1]\n" ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n" + ".inst 0xc15ba603 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[1]\n" "addvl x26, x26, #16\n" "ble 33f\n" - ".inst 0xa040c74d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x1\n" - ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n" + ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x1\n" + ".inst 0xc15baa00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[2]\n" ".inst 0xa041c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n" - ".inst 0xa042c75d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n" - ".inst 0xa043c755 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n" + ".inst 0xc15baa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z11.s[2]\n" + ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15baa02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[2]\n" + ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15baa03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[2]\n" "addvl x26, x26, #16\n" "ble 33f\n" ".inst 0xa040c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26]\n" - ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc15bae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z11.s[3]\n" ".inst 0xa041c759 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n" + ".inst 0xc15baf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z11.s[3]\n" ".inst 0xa042c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc15bae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z11.s[3]\n" ".inst 0xa043c751 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n" + ".inst 0xc15bae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z11.s[3]\n" "addvl x26, x26, #16\n" "33:" // Width 4: Multiply loop: multiply skip "tbz %x[flags], #1, 34f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z0.s }, p1/Z, [x21]\n" - ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n" - "ld1rw { z6.s }, p1/Z, [x20]\n" - ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n" - ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n" - ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n" - ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n" - ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n" - ".inst 
0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ "ld1rw { z21.s }, p1/Z, [x21]\n"
+ ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n"
+ "ld1rw { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n"
 "addvl x25, x25, #16\n"
 "b 35f\n"
 "34:" // Width 4: No activation
- ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
- ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n"
- ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
- ".inst 0xa061c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n"
+ ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n"
 ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
 ".inst 0xa062c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x8, MUL VL]\n"
- ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
- ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc0062c64 // mova { z4.d-z7.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c324 // st1w { z4.s-z7.s }, p8, [x25, #0xc, MUL VL]\n"
 "addvl x25, x25, #16\n"
 "35:" // Width 4: Output done
 "subs x27, x27, #0x4\n"
@@ -540,7 +540,7 @@ void sme2_gemv_fp32_mla_16VL (
 "bgt 4b\n"
 "36:" // Exit
 ".inst 0xd503467f // SMSTOP\n"
- "ptrue p1.b\n"
+ "ptrue p8.b\n"
 : [N] "+&r" (N)
 : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
 : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -549,5 +549,4 @@ void sme2_gemv_fp32_mla_16VL (
 } // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SME2
-#endif
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
index f52fbcd57f..76c2bdd71e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 *
@@ -10,20 +10,19 @@
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
 */
 #pragma once
-
-#ifdef __aarch64__
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "../std_transforms_sme.hpp"
 #include "../bfloat.hpp"
@@ -84,4 +83,4 @@ public:
 #undef ARGLIST
-#endif // __aarch64__
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
index 0a394b6413..c6fa11016f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifdef __ARM_FEATURE_SVE
-#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
 #include "arm_gemm.hpp"
 #include "../../utils.hpp"
@@ -62,7 +62,7 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
 break;
 }
 __asm__ __volatile__(
- "ptrue p2.b\n"
+ "ptrue p8.b\n"
 ".inst 0xd503477f // SMSTART ZA\n"
 "cntw x10, ALL, MUL #4\n"
 "add x28, %x[N], x10\n"
@@ -103,494 +103,494 @@ void sme2_gemv_fp32bf16fp32_dot_16VL (
 "bgt 20f\n"
 "beq 12f\n"
 "mov x23, %x[A_ptr]\n"
- "lsl x22, %x[K], #0x2\n"
+ "lsl x21, %x[K], #0x2\n"
 "mov x20, %x[N]\n"
- "mov x21, %x[K]\n"
- ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n"
+ "mov x22, %x[K]\n"
+ ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n"
 ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n"
 "cbz x24, 5f\n"
- ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n"
- ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa040c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24]\n"
+ ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n"
 "b 6f\n"
 "5:" // Width 1: no bias
 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
 "6:" // Width 1: setup done
- "cmp x21, #0x8\n"
+ "cmp x22, #0x8\n"
 "ble 8f\n"
 "7:" // Width 1: Multiply loop: Main loop head
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "sub x21, x21, #0x8\n"
- "uzp1 z11.h, z11.h, z11.h\n"
- "trn1 z0.d, z0.d, z11.d\n"
- ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n"
- "addvl x26, x26, #16\n"
- ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
- "addvl x26, x26, #16\n"
- "cmp x21, #0x8\n"
- ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
- ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n"
+ "whilelt p1.s, XZR, x22\n"
+ "whilelt p0.s, x27, x22\n"
+ "ld1rqw { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
+ "ld1rqw { z16.s }, p0/Z, [x23, #16]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "uzp1 z10.h, z10.h, z10.h\n"
+ "sub x22, x22, #0x8\n"
+ "uzp1 z16.h, z16.h, z16.h\n"
+ "trn1 z10.d, z10.d, z16.d\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ ".inst 0xc15ab198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[0]\n"
+ ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #16\n"
+ "cmp x22, #0x8\n"
+ ".inst 0xc15ab598 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[1]\n"
+ ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n"
 "addvl x26, x26, #16\n"
 "add x23, x23, #0x20\n"
- ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xc15ab818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[2]\n"
 ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n"
 "addvl x26, x26, #16\n"
- ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xc15abf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z10.h[3]\n"
 "bgt 7b\n"
 "8:" // Width 1: Multiply loop: Single iteration only
- "whilelt p1.s, XZR, x21\n"
- "whilelt p0.s, x27, x21\n"
- "ld1rqw { z0.s }, p1/Z, [x23]\n"
- ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
- "ld1rqw { z11.s }, p0/Z, [x23, #16]\n"
- ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
- "uzp1 z0.h, z0.h, z0.h\n"
- "subs x21, x21, #0x2\n"
- "uzp1 z11.h, z11.h, z11.h\n"
-
"trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z15.s }, p1/Z, [x23]\n" + ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n" + "ld1rqw { z17.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n" + "uzp1 z15.h, z15.h, z15.h\n" + "subs x22, x22, #0x2\n" + "uzp1 z17.h, z17.h, z17.h\n" + "trn1 z15.d, z15.d, z17.d\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x20\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" + ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n" "addvl x26, x26, #16\n" "ble 9f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb418 // bfdot za.s[x9, 0], { z0.h-z3.h }, z15.h[1]\n" "addvl x26, x26, #16\n" "ble 9f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" + ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n" "addvl x26, x26, #16\n" "ble 9f\n" - ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15fbd18 // bfdot za.s[x9, 0], { z8.h-z11.h }, z15.h[3]\n" "addvl x26, x26, #16\n" "9:" // Width 1: Multiply loop: multiply skip "tbz %x[flags], #1, 10f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z29.s }, p2/Z, [x21]\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" - ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n" - ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" + ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n" + "ld1rw { z8.s }, p2/Z, [x21]\n" + "ld1rw { z26.s }, p2/Z, [x20]\n" + ".inst 0xc1bac900 // fclamp { z0.s-z3.s }, z8.s, z26.s\n" + ".inst 0xa060c320 // st1w { z0.s-z3.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "b 11f\n" "10:" // Width 1: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c328 // st1w { z8.s-z11.s }, p8, [x25]\n" + ".inst 0xc0062c04 // mova { z4.d-z7.d }, za.d[x9, #0]\n" + ".inst 0xa060c324 // st1w { z4.s-z7.s }, p8, [x25]\n" "addvl x25, x25, #4\n" "11:" // Width 1: Output done "b 36f\n" "12:" // Width 2 "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "sub x20, %x[N], x10\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 13f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" + ".inst 0xa040c718 // ld1w { z24.s-z27.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f00 // mova za.d[x9, #0], { z24.d-z27.d }\n" ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" "b 14f\n" "13:" // Width 2: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "14:" // 
Width 2: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 16f\n" "15:" // Width 2: Multiply loop: Main loop head - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "sub x21, x21, #0x8\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" - "cmp x21, #0x8\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z13.s }, p1/Z, [x23]\n" + ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n" + "ld1rqw { z27.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aab7b // bfcvt z27.h, p2/M, z27.s\n" + "uzp1 z13.h, z13.h, z13.h\n" + "sub x22, x22, #0x8\n" + "uzp1 z27.h, z27.h, z27.h\n" + "trn1 z13.d, z13.d, z27.d\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + "cmp x22, #0x8\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15db298 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[0]\n" "addvl x26, x26, #16\n" "add x23, x23, #0x20\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15db019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z13.h[0]\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15db698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z13.h[1]\n" + "addvl x26, x26, #16\n" + ".inst 0xc15db719 // bfdot za.s[x9, 1], { z24.h-z27.h }, z13.h[1]\n" + ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15db918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z13.h[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xc15dba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z13.h[2]\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" - ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xc15dbc18 // bfdot za.s[x9, 0], { z0.h-z3.h }, z13.h[3]\n" "addvl x26, x26, #16\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" + ".inst 0xc15dbc99 // bfdot za.s[x9, 1], { z4.h-z7.h }, z13.h[3]\n" "bgt 15b\n" "16:" // Width 2: Multiply loop: Single iteration only - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "subs x21, 
x21, #0x2\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z15.s }, p1/Z, [x23]\n" + ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n" + "ld1rqw { z5.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n" + "uzp1 z15.h, z15.h, z15.h\n" + "subs x22, x22, #0x2\n" + "uzp1 z5.h, z5.h, z5.h\n" + "trn1 z15.d, z15.d, z5.d\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x20\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" + ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n" "addvl x26, x26, #16\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" + ".inst 0xc15fb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z15.h[0]\n" "ble 17f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" + ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb798 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[1]\n" ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" + ".inst 0xc15fb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z15.h[1]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" - ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n" "addvl x26, x26, #16\n" "ble 17f\n" ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n" ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" + ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n" "addvl x26, x26, #16\n" "17:" // Width 2: Multiply loop: multiply skip "tbz %x[flags], #1, 18f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z29.s }, p2/Z, [x21]\n" + ".inst 0xc0062c14 // mova { z20.d-z23.d }, za.d[x9, #0]\n" + "ld1rw { z11.s }, p2/Z, [x21]\n" ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" - ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + ".inst 0xc1bcc974 // fclamp { z20.s-z23.s }, z11.s, z28.s\n" + ".inst 0xa060c734 // st1w { z20.s-z23.s }, pn9.b, [x25]\n" + ".inst 0xc1bcc96c // fclamp { z12.s-z15.s }, z11.s, z28.s\n" ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, 
#0x4, MUL VL]\n" "addvl x25, x25, #8\n" "b 19f\n" "18:" // Width 2: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xa061c32c // st1w { z12.s-z15.s }, p8, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n" + ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n" + ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n" + ".inst 0xa061c320 // st1w { z0.s-z3.s }, p8, [x25, #0x4, MUL VL]\n" "addvl x25, x25, #8\n" "19:" // Width 2: Output done "b 36f\n" "20:" // Width 3 "mov x20, #0x2\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "msub x20, x10, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 21f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" - ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n" + ".inst 0xa040c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042f80 // mova za.d[x9, #0], { z28.d-z31.d }\n" + ".inst 0xa041c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042c81 // mova za.d[x9, #1], { z4.d-z7.d }\n" + ".inst 0xa042c704 // ld1w { z4.s-z7.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n" "b 22f\n" "21:" // Width 3: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "22:" // Width 3: setup done - "cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 24f\n" "23:" // Width 3: Multiply loop: Main loop head - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "sub x21, x21, #0x8\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z14.s }, p1/Z, [x23]\n" + ".inst 0x658aa9ce // bfcvt z14.h, p2/M, z14.s\n" + "ld1rqw { z16.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n" + "uzp1 z14.h, z14.h, z14.h\n" + "sub x22, x22, #0x8\n" + "uzp1 z16.h, z16.h, z16.h\n" + "trn1 z14.d, z14.d, z16.d\n" ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" - "cmp x21, #0x8\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" + "cmp x22, #0x8\n" + ".inst 0xa041a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15eb098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z14.h[0]\n" "add x23, x23, #0x20\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" + ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15eb319 // bfdot za.s[x9, 1], { z24.h-z27.h }, z14.h[0]\n" "addvl x26, x26, #16\n" - ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n" - 
".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15eb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[0]\n" + ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n" ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" - ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" + ".inst 0xc15eb518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z14.h[1]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n" - ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xc15eb499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z14.h[1]\n" + "addvl x26, x26, #16\n" + ".inst 0xc15eb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[1]\n" + ".inst 0xa040a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15eb818 // bfdot za.s[x9, 0], { z0.h-z3.h }, z14.h[2]\n" + ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15ebb99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xc15eb81a // bfdot za.s[x9, 2], { z0.h-z3.h }, z14.h[2]\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15ebf18 // bfdot za.s[x9, 0], { z24.h-z27.h }, z14.h[3]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" + ".inst 0xc15ebf99 // bfdot za.s[x9, 1], { z28.h-z31.h }, z14.h[3]\n" "addvl x26, x26, #16\n" - ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n" + ".inst 0xc15ebe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z14.h[3]\n" "bgt 23b\n" "24:" // Width 3: Multiply loop: Single iteration only - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "subs x21, x21, #0x2\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" - "add x23, x23, #0x20\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n" - "ble 25f\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z15.s }, p1/Z, [x23]\n" + ".inst 
0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n" + "ld1rqw { z31.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n" + "uzp1 z15.h, z15.h, z15.h\n" + "subs x22, x22, #0x2\n" + "uzp1 z31.h, z31.h, z31.h\n" + "trn1 z15.d, z15.d, z31.d\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" - ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n" + "add x23, x23, #0x20\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb218 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[0]\n" + ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fb019 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[0]\n" "addvl x26, x26, #16\n" + ".inst 0xc15fb09a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[0]\n" "ble 25f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" + ".inst 0xa040a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb698 // bfdot za.s[x9, 0], { z20.h-z23.h }, z15.h[1]\n" ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" + ".inst 0xc15fb699 // bfdot za.s[x9, 1], { z20.h-z23.h }, z15.h[1]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n" + ".inst 0xc15fb61a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[1]\n" + "addvl x26, x26, #16\n" + "ble 25f\n" + ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb898 // bfdot za.s[x9, 0], { z4.h-z7.h }, z15.h[2]\n" + ".inst 0xa041a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb819 // bfdot za.s[x9, 1], { z0.h-z3.h }, z15.h[2]\n" + ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fbb1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z15.h[2]\n" "addvl x26, x26, #16\n" "ble 25f\n" ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xc15fbf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z15.h[3]\n" ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n" + ".inst 0xc15fbd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z15.h[3]\n" + ".inst 0xa042a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fbc9a // bfdot za.s[x9, 2], { z4.h-z7.h }, z15.h[3]\n" "addvl x26, x26, #16\n" "25:" // Width 3: Multiply loop: multiply skip "tbz %x[flags], #1, 26f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z29.s }, p2/Z, [x21]\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" - ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, 
z29.s, z18.s\n" - ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n" - ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n" - ".inst 0xa062c324 // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" + "ld1rw { z17.s }, p2/Z, [x21]\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + ".inst 0xc1b0ca3c // fclamp { z28.s-z31.s }, z17.s, z16.s\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xa060c73c // st1w { z28.s-z31.s }, pn9.b, [x25]\n" + ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n" + ".inst 0xa061c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n" + ".inst 0xa062c32c // st1w { z12.s-z15.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "b 27f\n" "26:" // Width 3: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n" - ".inst 0xa062c324 // st1w { z4.s-z7.s }, p8, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c00 // mova { z0.d-z3.d }, za.d[x9, #0]\n" + ".inst 0xa060c720 // st1w { z0.s-z3.s }, pn9.b, [x25]\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n" + ".inst 0xa062c330 // st1w { z16.s-z19.s }, p8, [x25, #0x8, MUL VL]\n" "addvl x25, x25, #12\n" "27:" // Width 3: Output done "b 36f\n" "28:" // Width 4 "mov x20, #0x3\n" "mov x23, %x[A_ptr]\n" - "lsl x22, %x[K], #0x2\n" + "lsl x21, %x[K], #0x2\n" "msub x20, x10, x20, %x[N]\n" - "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" ".inst 0x25b467f0 // whilelt p8.s, XZR, x20, VLx4\n" "cbz x24, 29f\n" - ".inst 0xa040c700 // ld1w { z0.s-z3.s }, pn9.b/Z, [x24]\n" - ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" - ".inst 0xa041c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" - ".inst 0xa042c71c // ld1w { z28.s-z31.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n" - ".inst 0xa043c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n" - ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n" + ".inst 0xa040c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24]\n" + ".inst 0xc0042d80 // mova za.d[x9, #0], { z12.d-z15.d }\n" + ".inst 0xa041c70c // ld1w { z12.s-z15.s }, pn9.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n" + ".inst 0xa042c710 // ld1w { z16.s-z19.s }, pn9.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n" + ".inst 0xa043c714 // ld1w { z20.s-z23.s }, pn9.b/Z, [x24, #0xc, MUL VL]\n" + ".inst 0xc0042e83 // mova za.d[x9, #3], { z20.d-z23.d }\n" "addvl x24, x24, #16\n" "b 30f\n" "29:" // Width 4: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "30:" // Width 4: setup done - 
"cmp x21, #0x8\n" + "cmp x22, #0x8\n" "ble 32f\n" "31:" // Width 4: Multiply loop: Main loop head - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "sub x21, x21, #0x8\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" - "cmp x21, #0x8\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z6.s }, p1/Z, [x23]\n" + ".inst 0x658aa8c6 // bfcvt z6.h, p2/M, z6.s\n" + "ld1rqw { z16.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n" + "uzp1 z6.h, z6.h, z6.h\n" + "sub x22, x22, #0x8\n" + "uzp1 z16.h, z16.h, z16.h\n" + "trn1 z6.d, z6.d, z16.d\n" + ".inst 0xa040a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26]\n" + "cmp x22, #0x8\n" ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" + ".inst 0xc156b198 // bfdot za.s[x9, 0], { z12.h-z15.h }, z6.h[0]\n" "add x23, x23, #0x20\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" - ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n" + ".inst 0xc156b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z6.h[0]\n" + ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc156b19a // bfdot za.s[x9, 2], { z12.h-z15.h }, z6.h[0]\n" + "addvl x26, x26, #16\n" + ".inst 0xc156b21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[0]\n" + ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc156b518 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[1]\n" + ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc156b599 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[1]\n" + ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + 
".inst 0xc156b41a // bfdot za.s[x9, 2], { z0.h-z3.h }, z6.h[1]\n" + "addvl x26, x26, #16\n" + ".inst 0xc156b69b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[1]\n" + ".inst 0xa040a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26]\n" + ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc156b918 // bfdot za.s[x9, 0], { z8.h-z11.h }, z6.h[2]\n" + ".inst 0xa042a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc156b999 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[2]\n" + ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc156b91a // bfdot za.s[x9, 2], { z8.h-z11.h }, z6.h[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xc156ba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z6.h[2]\n" ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n" - "addvl x26, x26, #16\n" - ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n" + ".inst 0xa041a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc156bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z6.h[3]\n" + ".inst 0xa042a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc156bd99 // bfdot za.s[x9, 1], { z12.h-z15.h }, z6.h[3]\n" + ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc156bf1a // bfdot za.s[x9, 2], { z24.h-z27.h }, z6.h[3]\n" + "addvl x26, x26, #16\n" + ".inst 0xc156be1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z6.h[3]\n" "bgt 31b\n" "32:" // Width 4: Multiply loop: Single iteration only - "whilelt p1.s, XZR, x21\n" - "whilelt p0.s, x27, x21\n" - "ld1rqw { z0.s }, p1/Z, [x23]\n" - ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n" - "ld1rqw { z11.s }, p0/Z, [x23, #16]\n" - ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "subs x21, x21, #0x2\n" - "uzp1 z11.h, z11.h, z11.h\n" - "trn1 z0.d, z0.d, z11.d\n" - ".inst 0xa040a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26]\n" + "whilelt p1.s, XZR, x22\n" + "whilelt p0.s, x27, x22\n" + "ld1rqw { z15.s }, p1/Z, [x23]\n" + ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n" + "ld1rqw { z16.s }, p0/Z, [x23, #16]\n" + ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n" + "uzp1 z15.h, z15.h, z15.h\n" + "subs x22, x22, #0x2\n" + "uzp1 z16.h, z16.h, z16.h\n" + "trn1 z15.d, z15.d, z16.d\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" "add x23, x23, #0x20\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n" - ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n" + ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb318 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[0]\n" + ".inst 0xa042a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fb099 // bfdot za.s[x9, 1], { 
z4.h-z7.h }, z15.h[0]\n" + ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15fb01a // bfdot za.s[x9, 2], { z0.h-z3.h }, z15.h[0]\n" "addvl x26, x26, #16\n" - ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n" + ".inst 0xc15fb21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[0]\n" "ble 33f\n" - ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n" - ".inst 0xa041a745 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n" - ".inst 0xa042a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n" + ".inst 0xa040a759 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x26]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fb718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z15.h[1]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fb619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[1]\n" + ".inst 0xa042a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc15fb69a // bfdot za.s[x9, 2], { z20.h-z23.h }, z15.h[1]\n" + ".inst 0xa043a741 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15fb41b // bfdot za.s[x9, 3], { z0.h-z3.h }, z15.h[1]\n" "addvl x26, x26, #16\n" "ble 33f\n" ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" - "subs x21, x21, #0x2\n" - ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n" - ".inst 0xa041a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n" + "subs x22, x22, #0x2\n" + ".inst 0xc15fba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[2]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[2]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n" - ".inst 0xa043a74d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n" + ".inst 0xc15fba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[2]\n" + ".inst 0xa043a755 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15fba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z15.h[2]\n" "addvl x26, x26, #16\n" "ble 33f\n" - ".inst 0xa040a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26]\n" - ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n" - ".inst 0xa041a749 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n" + ".inst 0xa040a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26]\n" + ".inst 0xc15fbe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z15.h[3]\n" + ".inst 0xa041a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc15fbe19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z15.h[3]\n" ".inst 0xa042a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n" - ".inst 0xa043a75d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc150bf9b // bfdot za.s[x9, 3], { 
z28.h-z31.h }, z0.h[3]\n" + ".inst 0xc15fbe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z15.h[3]\n" + ".inst 0xa043a751 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc15fbe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z15.h[3]\n" "addvl x26, x26, #16\n" "33:" // Width 4: Multiply loop: multiply skip "tbz %x[flags], #1, 34f\n" "add x21, %x[args_ptr], %[offset_min]\n" "add x20, %x[args_ptr], %[offset_max]\n" - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - "ld1rw { z29.s }, p2/Z, [x21]\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - "ld1rw { z18.s }, p2/Z, [x20]\n" - ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n" - ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n" - ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n" - ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n" - ".inst 0xa062c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n" - ".inst 0xc1b2cba0 // fclamp { z0.s-z3.s }, z29.s, z18.s\n" - ".inst 0xa063c320 // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + "ld1rw { z21.s }, p2/Z, [x21]\n" + ".inst 0xc0062c38 // mova { z24.d-z27.d }, za.d[x9, #1]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n" + ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" + ".inst 0xa060c72c // st1w { z12.s-z15.s }, pn9.b, [x25]\n" + ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc0062c70 // mova { z16.d-z19.d }, za.d[x9, #3]\n" + ".inst 0xa061c738 // st1w { z24.s-z27.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n" + ".inst 0xa062c720 // st1w { z0.s-z3.s }, pn9.b, [x25, #0x8, MUL VL]\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" + ".inst 0xa063c330 // st1w { z16.s-z19.s }, p8, [x25, #0xc, MUL VL]\n" "addvl x25, x25, #16\n" "b 35f\n" "34:" // Width 4: No activation - ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" - ".inst 0xa060c728 // st1w { z8.s-z11.s }, pn9.b, [x25]\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xa061c72c // st1w { z12.s-z15.s }, pn9.b, [x25, #0x4, MUL VL]\n" - ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n" - ".inst 0xa062c724 // st1w { z4.s-z7.s }, pn9.b, [x25, #0x8, MUL VL]\n" - ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n" - ".inst 0xa063c320 // st1w { z0.s-z3.s }, p8, [x25, #0xc, MUL VL]\n" + ".inst 0xc0062c10 // mova { z16.d-z19.d }, za.d[x9, #0]\n" + ".inst 0xa060c730 // st1w { z16.s-z19.s }, pn9.b, [x25]\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 0xa061c730 // st1w { z16.s-z19.s }, pn9.b, [x25, #0x4, MUL VL]\n" + ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n" + ".inst 0xa062c734 // st1w { z20.s-z23.s }, pn9.b, [x25, #0x8, MUL VL]\n" + ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n" + ".inst 0xa063c338 // st1w { z24.s-z27.s }, p8, [x25, #0xc, MUL VL]\n" "addvl x25, x25, #16\n" "35:" // Width 4: Output done "subs x28, x28, #0x4\n" @@ -598,7 +598,7 @@ void sme2_gemv_fp32bf16fp32_dot_16VL ( "bgt 4b\n" "36:" // Exit ".inst 0xd503467f // SMSTOP\n" - "ptrue p2.b\n" + "ptrue p8.b\n" : [N] "+&r" (N) : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), 
[flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -607,5 +607,4 @@ void sme2_gemv_fp32bf16fp32_dot_16VL ( } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SME2 -#endif +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp index 4c9f9cff9a..65e4667f88 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,19 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - -#ifdef __aarch64__ +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "../std_transforms_sme.hpp" #define ARGLIST \ @@ -83,4 +82,4 @@ public: #undef ARGLIST -#endif // __aarch64__ +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp index 26dc0b9dd2..86bd8aeb04 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE -#ifdef ARM_COMPUTE_ENABLE_SME2 + +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "arm_gemm.hpp" #include "../../utils.hpp" @@ -35,11 +35,9 @@ namespace arm_gemm { void sme2_gemv_s8qa_dot_16VL ( const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr, size_t N, size_t K, - const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base + const Requantize32 *qp, const int32_t *col_bias, unsigned int ) { - ARM_COMPUTE_UNUSED(col_base); - struct KernelArgs { const int8_t *B_ptr = {}; size_t output_offset = {}; @@ -52,7 +50,7 @@ void sme2_gemv_s8qa_dot_16VL ( flags |= 0x20; } __asm__ __volatile__( - "ptrue p2.b\n" + "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" "cntw x28, ALL, MUL #4\n" "add x27, %x[N], x28\n" @@ -84,8 +82,8 @@ void sme2_gemv_s8qa_dot_16VL ( ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n" "3:" // RHS prefetch exit "mov x24, %x[col_bias]\n" - "mov z26.s, #0x0\n" - "mov z24.b, #0x1\n" + "mov z28.s, #0x0\n" + "mov z29.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" "4:" // Column loop "cmp x27, #0x4\n" @@ -94,404 +92,404 @@ void sme2_gemv_s8qa_dot_16VL ( "bgt 24f\n" "beq 14f\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "mov x20, %x[N]\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 5f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" "b 6f\n" "5:" // Width 1: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "6:" // Width 1: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 9f\n" "7:" // Width 1: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - "addvl x26, x26, #16\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" + ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 8f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "8:" // Width 1: Multiply loop: unique 1: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 7b\n" "9:" // Width 1: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, 
[x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" + ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 10f\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 10f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 10f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bd20 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "10:" // Width 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "11:" // Width 1: Multiply loop: unique 2: skip row sum "tbnz %x[flags], #31, 12f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z26.s }, p2/Z, [x21]\n" + "neg z26.s, p2/M, z26.s\n" "whilelt p0.s, XZR, x20\n" - "saddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "saddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z26.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "12:" // Width 1: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z7.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x25]\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 
0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + "ld1rw { z30.s }, p2/Z, [x20]\n" + ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" + ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "uzp1 z19.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z19.b\n" + "st1b { z12.b }, p1, [x25]\n" "addvl x25, x25, #1\n" "13:" // Width 1: Output done "b 44f\n" "14:" // Width 2 "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "sub x20, %x[N], x28\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "sub x20, %x[N], x28\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 15f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" + ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n" + ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n" "b 16f\n" "15:" // Width 2: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "16:" // Width 2: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 19f\n" "17:" // Width 2: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" + ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b6a1 // sdot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // 
sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bca0 // sdot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n" + ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 18f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "18:" // Width 2: Multiply loop: unique 3: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 17b\n" "19:" // Width 2: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b320 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 20f\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 20f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b9a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 20f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bd21 // sdot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "20:" // Width 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 21f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" 
"21:" // Width 2: Multiply loop: unique 4: skip row sum "tbnz %x[flags], #31, 22f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "saddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "saddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "22:" // Width 2: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z9.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x25, #1, MUL VL]\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n" + ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" + ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n" + ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" + ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" + ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n" + ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n" + ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n" + ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z9.h, z26.h, z27.h\n" + "uzp1 z0.h, z0.h, z1.h\n" + "uzp1 z26.h, z2.h, z3.h\n" + "uzp1 z24.b, z24.b, z9.b\n" + "st1b { z24.b }, p2, [x25]\n" + "uzp1 z0.b, z0.b, z26.b\n" + "st1b { z0.b }, p1, [x25, #1, MUL VL]\n" "addvl x25, x25, #2\n" "23:" // Width 2: Output done "b 44f\n" "24:" // Width 3 "mov x20, #0x2\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "msub x20, x28, x20, %x[N]\n" "mov 
x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 25f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" - ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n" + ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n" + ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n" + ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n" "b 26f\n" "25:" // Width 3: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "26:" // Width 3: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 29f\n" "27:" // Width 3: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b5a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b6a2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b920 // sdot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n" ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9a2 // sdot za.s[x9, 2], { 
z12.b-z15.b }, z3.b[2]\n" + ".inst 0xc151b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bf20 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bca1 // sdot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 28f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "28:" // Width 3: Multiply loop: unique 5: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 27b\n" "29:" // Width 3: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b2a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b222 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b720 // sdot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" + ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, 
#0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151baa2 // sdot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n" + ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151bda2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "30:" // Width 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 31f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "31:" // Width 3: Multiply loop: unique 6: skip row sum "tbnz %x[flags], #31, 32f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "saddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "saddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "32:" // Width 3: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" - ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { 
z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n" - "uzp1 z29.h, z30.h, z31.h\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" + ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" + ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" + ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n" + ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n" + ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n" + ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n" + ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n" + "uzp1 z18.h, z10.h, z11.h\n" + "uzp1 z4.h, z4.h, z5.h\n" + "uzp1 z17.h, z6.h, z7.h\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z0.h, z0.h, z1.h\n" - "uzp1 z1.h, z2.h, z3.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" - "uzp1 z0.b, z0.b, z1.b\n" - "st1b { z0.b }, p1, [x25, #2, MUL VL]\n" + "uzp1 z16.h, z14.h, z15.h\n" + "uzp1 z8.b, z8.b, z18.b\n" + "st1b { z8.b }, p2, [x25]\n" + "uzp1 z4.b, z4.b, z17.b\n" + "st1b { z4.b }, p2, [x25, #1, MUL VL]\n" + "uzp1 z12.b, z12.b, z16.b\n" + "st1b { z12.b }, p1, [x25, #2, MUL VL]\n" "addvl x25, x25, #3\n" "33:" // Width 3: Output done "b 44f\n" "34:" // Width 4 "mov x20, #0x3\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "msub x20, x28, x20, %x[N]\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 35f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" - ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n" + ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n" + ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" + ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, 
MUL VL]\n" + ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n" ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n" ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n" "addvl x24, x24, #16\n" @@ -499,165 +497,165 @@ void sme2_gemv_s8qa_dot_16VL ( "35:" // Width 4: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "36:" // Width 4: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 39f\n" "37:" // Width 4: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b221 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n" ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" - ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n" - "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" - ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n" + ".inst 0xc151b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b623 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // 
sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9a1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bda0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bda1 // sdot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151bf22 // sdot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151bca3 // sdot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 38f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "38:" // Width 4: Multiply loop: unique 7: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 37b\n" "39:" // Width 4: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" - ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1a0 // sdot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" + ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b321 // sdot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n" + ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b122 // sdot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b223 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" - ".inst 0xa043834d // 
ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b620 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b621 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b5a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b6a3 // sdot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" - ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151ba20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151ba21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151ba22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151ba23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151be20 // sdot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151be21 // sdot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n" ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "40:" // Width 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 41f\n" - "sdot z26.s, z3.b, z24.b\n" + "sdot z28.s, z1.b, z29.b\n" "41:" // Width 4: Multiply loop: unique 8: skip row sum "tbnz %x[flags], #31, 42f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "saddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "saddv d28, 
p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "42:" // Width 4: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z11.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z7.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" "ld1rw { z6.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" - ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" - ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n" - ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n" - ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z3.s }, p2/Z, [x21]\n" + ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n" + ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" + ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n" + ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n" + ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n" + ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" + ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n" + "ld1rw { z31.s }, p2/Z, [x20]\n" + ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" + ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n" + ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" + ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" + ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n" ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n" - ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n" - "uzp1 z29.h, z30.h, z31.h\n" + ".inst 0xc1bfcc78 // sclamp { 
z24.s-z27.s }, z3.s, z31.s\n" + ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n" + ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z20.h, z20.h, z21.h\n" + "uzp1 z17.h, z22.h, z23.h\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z0.h, z0.h, z1.h\n" - "uzp1 z1.h, z2.h, z3.h\n" - "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" - "uzp1 z0.b, z0.b, z1.b\n" - "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z0.b }, p2, [x25, #2, MUL VL]\n" - "st1b { z8.b }, p1, [x25, #3, MUL VL]\n" + "uzp1 z30.h, z14.h, z15.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p2, [x25]\n" + "uzp1 z16.b, z16.b, z18.b\n" + "st1b { z16.b }, p2, [x25, #1, MUL VL]\n" + "uzp1 z20.b, z20.b, z17.b\n" + "uzp1 z12.b, z12.b, z30.b\n" + "st1b { z20.b }, p2, [x25, #2, MUL VL]\n" + "st1b { z12.b }, p1, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" "43:" // Width 4: Output done "subs x27, x27, #0x4\n" @@ -665,7 +663,7 @@ void sme2_gemv_s8qa_dot_16VL ( "bgt 4b\n" "44:" // Exit ".inst 0xd503467f // SMSTOP\n" - "ptrue p2.b\n" + "ptrue p8.b\n" : [N] "+&r" (N), [flags] "+&r" (flags) : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -674,5 +672,4 @@ void sme2_gemv_s8qa_dot_16VL ( } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SME2 -#endif +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp index e15b95445e..46d8c4439b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,19 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - -#ifdef __aarch64__ +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "../std_transforms_sme.hpp" #define ARGLIST \ @@ -83,4 +82,4 @@ public: #undef ARGLIST -#endif // __aarch64__ +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp index dfdc4ea289..093feee6ce 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE -#ifdef ARM_COMPUTE_ENABLE_SME2 + +#if defined(ARM_COMPUTE_ENABLE_SME2) #include "arm_gemm.hpp" #include "../../utils.hpp" @@ -35,11 +35,9 @@ namespace arm_gemm { void sme2_gemv_u8qa_dot_16VL ( const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr, size_t N, size_t K, - const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base + const Requantize32 *qp, const int32_t *col_bias, unsigned int ) { - ARM_COMPUTE_UNUSED(col_base); - struct KernelArgs { const uint8_t *B_ptr = {}; size_t output_offset = {}; @@ -52,7 +50,7 @@ void sme2_gemv_u8qa_dot_16VL ( flags |= 0x20; } __asm__ __volatile__( - "ptrue p2.b\n" + "ptrue p8.b\n" ".inst 0xd503477f // SMSTART ZA\n" "cntw x28, ALL, MUL #4\n" "add x27, %x[N], x28\n" @@ -84,8 +82,8 @@ void sme2_gemv_u8qa_dot_16VL ( ".inst 0xf8b64b5a // rprfm pldonce, x22, [x26]\n" "3:" // RHS prefetch exit "mov x24, %x[col_bias]\n" - "mov z26.s, #0x0\n" - "mov z24.b, #0x1\n" + "mov z28.s, #0x0\n" + "mov z29.b, #0x1\n" "bic %x[flags], %x[flags], #0x80000000\n" "4:" // Column loop "cmp x27, #0x4\n" @@ -94,404 +92,404 @@ void sme2_gemv_u8qa_dot_16VL ( "bgt 24f\n" "beq 14f\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "mov x20, %x[N]\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "mov x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 5f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" + ".inst 0xa040c300 // ld1w { z0.s-z3.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n" "b 6f\n" "5:" // Width 1: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "6:" // Width 1: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 9f\n" "7:" // Width 1: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { 
z16.b-z19.b }, z3.b[0]\n" - "addvl x26, x26, #16\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" + ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 8f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "8:" // Width 1: Multiply loop: unique 1: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 7b\n" "9:" // Width 1: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" + ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 10f\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 10f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 10f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bd30 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "10:" // Width 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "11:" // Width 1: Multiply loop: unique 2: skip row sum "tbnz %x[flags], #31, 12f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z26.s }, p2/Z, [x21]\n" + "neg z26.s, p2/M, z26.s\n" "whilelt p0.s, XZR, x20\n" - "uaddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "uaddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z26.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "12:" // Width 1: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, 
z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z7.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x25]\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c0c // mova { z12.d-z15.d }, za.d[x9, #0]\n" + ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + "ld1rw { z30.s }, p2/Z, [x20]\n" + ".inst 0xc1a2ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" + ".inst 0xc1bece0c // sclamp { z12.s-z15.s }, z16.s, z30.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "uzp1 z19.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z19.b\n" + "st1b { z12.b }, p1, [x25]\n" "addvl x25, x25, #1\n" "13:" // Width 1: Output done "b 44f\n" "14:" // Width 2 "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "sub x20, %x[N], x28\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "sub x20, %x[N], x28\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 15f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" + ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n" + ".inst 0xa041c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042f01 // mova za.d[x9, #1], { z24.d-z27.d }\n" "b 16f\n" "15:" // Width 2: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "16:" // Width 2: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 19f\n" "17:" // Width 2: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" + ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, 
#0x4, MUL VL]\n" + ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b6b1 // udot za.s[x9, 1], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bcb0 // udot za.s[x9, 0], { z4.b-z7.b }, z1.b[3]\n" + ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 18f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "18:" // Width 2: Multiply loop: unique 3: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 17b\n" "19:" // Width 2: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b330 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 20f\n" ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 20f\n" - 
".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b9b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 20f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bd31 // udot za.s[x9, 1], { z8.b-z11.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "20:" // Width 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 21f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "21:" // Width 2: Multiply loop: unique 4: skip row sum "tbnz %x[flags], #31, 22f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "uaddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "uaddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "22:" // Width 2: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z6.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z5.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z9.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, 
z14.h, z15.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x25, #1, MUL VL]\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n" + ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" + ".inst 0xc0062c20 // mova { z0.d-z3.d }, za.d[x9, #1]\n" + ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" + ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + ".inst 0xc1a5aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" + ".inst 0xc1a9ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n" + ".inst 0xc1a9ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n" + ".inst 0xc1b5ce18 // sclamp { z24.s-z27.s }, z16.s, z21.s\n" + ".inst 0xc1b5ce00 // sclamp { z0.s-z3.s }, z16.s, z21.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z9.h, z26.h, z27.h\n" + "uzp1 z0.h, z0.h, z1.h\n" + "uzp1 z26.h, z2.h, z3.h\n" + "uzp1 z24.b, z24.b, z9.b\n" + "st1b { z24.b }, p2, [x25]\n" + "uzp1 z0.b, z0.b, z26.b\n" + "st1b { z0.b }, p1, [x25, #1, MUL VL]\n" "addvl x25, x25, #2\n" "23:" // Width 2: Output done "b 44f\n" "24:" // Width 3 "mov x20, #0x2\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "msub x20, x28, x20, %x[N]\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 25f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" - ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n" + ".inst 0xa040c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e00 // mova za.d[x9, #0], { z16.d-z19.d }\n" + ".inst 0xa041c30c // ld1w { z12.s-z15.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042d81 // mova za.d[x9, #1], { z12.d-z15.d }\n" + ".inst 0xa042c318 // ld1w { z24.s-z27.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042f02 // mova za.d[x9, #2], { z24.d-z27.d }\n" "b 26f\n" "25:" // Width 3: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "26:" // Width 3: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 29f\n" "27:" // Width 3: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { 
z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b5b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b6b2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" + ".inst 0xa0408349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b930 // udot za.s[x9, 0], { z8.b-z11.b }, z1.b[2]\n" ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" + ".inst 0xc151b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[2]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bf30 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bcb1 // udot za.s[x9, 1], { z4.b-z7.b }, z1.b[3]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 28f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "28:" // Width 3: Multiply loop: unique 5: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 27b\n" "29:" // Width 3: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b2b0 // udot za.s[x9, 0], { z20.b-z23.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { 
z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b232 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b730 // udot za.s[x9, 0], { z24.b-z27.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[1]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" + ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0428355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151bab2 // udot za.s[x9, 2], { z20.b-z23.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 30f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n" + ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151bdb2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "30:" // Width 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 31f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "31:" // Width 3: Multiply loop: unique 6: skip row sum "tbnz %x[flags], #31, 32f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "uaddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "uaddv d28, p0, z28.s\n" + "mov z28.s, 
z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "32:" // Width 3: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" + "ld1rw { z3.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" - ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n" - "uzp1 z29.h, z30.h, z31.h\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n" + ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" + ".inst 0xc0062c24 // mova { z4.d-z7.d }, za.d[x9, #1]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc0062c4c // mova { z12.d-z15.d }, za.d[x9, #2]\n" + ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" + ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" + ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a3ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n" + ".inst 0xc1a3ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n" + ".inst 0xc1a3ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n" + ".inst 0xc1a0ce08 // sclamp { z8.s-z11.s }, z16.s, z0.s\n" + ".inst 0xc1a0ce04 // sclamp { z4.s-z7.s }, z16.s, z0.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0xc1a0ce0c // sclamp { z12.s-z15.s }, z16.s, z0.s\n" + "uzp1 z18.h, z10.h, z11.h\n" + "uzp1 z4.h, z4.h, z5.h\n" + "uzp1 z17.h, z6.h, z7.h\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z0.h, z0.h, z1.h\n" - "uzp1 z1.h, z2.h, z3.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" - "uzp1 z0.b, z0.b, z1.b\n" - "st1b { 
z0.b }, p1, [x25, #2, MUL VL]\n" + "uzp1 z16.h, z14.h, z15.h\n" + "uzp1 z8.b, z8.b, z18.b\n" + "st1b { z8.b }, p2, [x25]\n" + "uzp1 z4.b, z4.b, z17.b\n" + "st1b { z4.b }, p2, [x25, #1, MUL VL]\n" + "uzp1 z12.b, z12.b, z16.b\n" + "st1b { z12.b }, p1, [x25, #2, MUL VL]\n" "addvl x25, x25, #3\n" "33:" // Width 3: Output done "b 44f\n" "34:" // Width 4 "mov x20, #0x3\n" "mov x23, %x[A_ptr]\n" - "mov x22, %x[K]\n" - "msub x20, x28, x20, %x[N]\n" "mov x21, %x[K]\n" - ".inst 0xf8b64af8 // rprfm pldmany, x22, [x23]\n" + "msub x20, x28, x20, %x[N]\n" + "mov x22, %x[K]\n" + ".inst 0xf8b54af8 // rprfm pldmany, x21, [x23]\n" "whilelt p1.b, XZR, x20\n" "cbz x24, 35f\n" - ".inst 0xa040c304 // ld1w { z4.s-z7.s }, pn8.b/Z, [x24]\n" - ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n" - ".inst 0xa041c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" - ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n" - ".inst 0xa042c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" - ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n" + ".inst 0xa040c314 // ld1w { z20.s-z23.s }, pn8.b/Z, [x24]\n" + ".inst 0xc0042e80 // mova za.d[x9, #0], { z20.d-z23.d }\n" + ".inst 0xa041c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x4, MUL VL]\n" + ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n" + ".inst 0xa042c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0x8, MUL VL]\n" + ".inst 0xc0042e02 // mova za.d[x9, #2], { z16.d-z19.d }\n" ".inst 0xa043c310 // ld1w { z16.s-z19.s }, pn8.b/Z, [x24, #0xc, MUL VL]\n" ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n" "addvl x24, x24, #16\n" @@ -499,165 +497,165 @@ void sme2_gemv_u8qa_dot_16VL ( "35:" // Width 4: no bias ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "36:" // Width 4: setup done - "cmp x21, #0x10\n" + "cmp x22, #0x10\n" "ble 39f\n" "37:" // Width 4: Multiply loop: Main loop head - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x23, x23, #0x10\n" ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" + ".inst 0xc151b230 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[0]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b231 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[0]\n" ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[0]\n" ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z1.b[0]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b5b1 // udot za.s[x9, 1], { 
z12.b-z15.b }, z1.b[1]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" - ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n" - "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" - ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n" + ".inst 0xc151b632 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b633 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[1]\n" "addvl x26, x26, #16\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b9b1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[2]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n" + "addvl x26, x26, #16\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151bdb0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa041834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151bdb1 // udot za.s[x9, 1], { z12.b-z15.b }, z1.b[3]\n" + ".inst 0xa0428359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151bf32 // udot za.s[x9, 2], { z24.b-z27.b }, z1.b[3]\n" + ".inst 0xa0438345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151bcb3 // udot za.s[x9, 3], { z4.b-z7.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "tbnz %x[flags], #31, 38f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "38:" // Width 4: Multiply loop: unique 7: skip row sum - "sub x21, x21, #0x10\n" - "cmp x21, #0x10\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" "bgt 37b\n" "39:" // Width 4: Multiply loop: Single iteration only - "whilelt p0.b, XZR, x21\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" - "subs x21, x21, #0x4\n" - ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "whilelt p0.b, XZR, x22\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "subs x22, x22, #0x4\n" + ".inst 0xa040834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26]\n" "add x23, x23, #0x10\n" - ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, 
MUL VL]\n" - ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n" - ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n" + ".inst 0xc151b1b0 // udot za.s[x9, 0], { z12.b-z15.b }, z1.b[0]\n" + ".inst 0xa0418359 // ldnt1b { z24.b-z27.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b331 // udot za.s[x9, 1], { z24.b-z27.b }, z1.b[0]\n" + ".inst 0xa0428349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b132 // udot za.s[x9, 2], { z8.b-z11.b }, z1.b[0]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b233 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[0]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n" - ".inst 0xa0418349 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n" - ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n" - ".inst 0xa043834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151b630 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151b631 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[1]\n" + ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151b5b2 // udot za.s[x9, 2], { z12.b-z15.b }, z1.b[1]\n" + ".inst 0xa0438355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151b6b3 // udot za.s[x9, 3], { z20.b-z23.b }, z1.b[1]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - "subs x21, x21, #0x4\n" - ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n" - ".inst 0xa0418345 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n" - ".inst 0xa042834d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n" - ".inst 0xa043835d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + "subs x22, x22, #0x4\n" + ".inst 0xc151ba30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151ba31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" + ".inst 0xc151ba32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[2]\n" + ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" + ".inst 0xc151ba33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[2]\n" "addvl x26, x26, #16\n" "ble 40f\n" - ".inst 0xa0408355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26]\n" - ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n" - ".inst 
0xa0418355 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" - ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n" + ".inst 0xa0408351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26]\n" + ".inst 0xc151be30 // udot za.s[x9, 0], { z16.b-z19.b }, z1.b[3]\n" + ".inst 0xa0418351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n" + ".inst 0xc151be31 // udot za.s[x9, 1], { z16.b-z19.b }, z1.b[3]\n" ".inst 0xa0428351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n" - ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be32 // udot za.s[x9, 2], { z16.b-z19.b }, z1.b[3]\n" ".inst 0xa0438351 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n" - ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n" + ".inst 0xc151be33 // udot za.s[x9, 3], { z16.b-z19.b }, z1.b[3]\n" "addvl x26, x26, #16\n" "40:" // Width 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 41f\n" - "udot z26.s, z3.b, z24.b\n" + "udot z28.s, z1.b, z29.b\n" "41:" // Width 4: Multiply loop: unique 8: skip row sum "tbnz %x[flags], #31, 42f\n" "add x21, %x[qp], %[b_offset]\n" "mov x20, #0x4\n" - "ld1rw { z10.s }, p2/Z, [x21]\n" - "neg z10.s, p2/M, z10.s\n" + "ld1rw { z16.s }, p2/Z, [x21]\n" + "neg z16.s, p2/M, z16.s\n" "whilelt p0.s, XZR, x20\n" - "uaddv d26, p0, z26.s\n" - "mov z26.s, z26.s[0]\n" - "mul z26.s, p2/M, z26.s, z10.s\n" + "uaddv d28, p0, z28.s\n" + "mov z28.s, z28.s[0]\n" + "mul z28.s, p2/M, z28.s, z16.s\n" "orr %x[flags], %x[flags], #0x80000000\n" "42:" // Width 4: skip row sum fixup - ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b80 // addha za0.s, p2/M, p2/M, z28.s\n" "add x20, %x[qp], %[per_layer_mul]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" + "ld1rw { z11.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + ".inst 0xc0904b81 // addha za1.s, p2/M, p2/M, z28.s\n" + "ld1rw { z7.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[c_offset]\n" "add x21, %x[qp], %[minval]\n" - ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n" + ".inst 0xc0904b82 // addha za2.s, p2/M, p2/M, z28.s\n" "ld1rw { z6.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[maxval]\n" - ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n" - "ld1rw { z21.s }, p2/Z, [x21]\n" - ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n" - ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n" - ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n" - ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n" - ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n" - ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n" - ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n" - ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n" - ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n" - "ld1rw { z16.s }, p2/Z, [x20]\n" - ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n" - ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n" - ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" + ".inst 0xc0904b83 // addha za3.s, p2/M, p2/M, z28.s\n" + "ld1rw { z3.s }, p2/Z, [x21]\n" + ".inst 0xc0062c18 // mova { z24.d-z27.d }, za.d[x9, #0]\n" + ".inst 0xc1abac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n" + ".inst 0xc0062c30 // mova { z16.d-z19.d }, za.d[x9, #1]\n" + ".inst 
0xc1abac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" + ".inst 0xc0062c54 // mova { z20.d-z23.d }, za.d[x9, #2]\n" + ".inst 0xc1abac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n" + ".inst 0xc0062c6c // mova { z12.d-z15.d }, za.d[x9, #3]\n" + ".inst 0xc1abac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" + ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n" + "ld1rw { z31.s }, p2/Z, [x20]\n" + ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" + ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n" + ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" + ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" + ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n" ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n" - ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n" - ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n" - "uzp1 z28.h, z28.h, z29.h\n" - ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n" - ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n" - "uzp1 z29.h, z30.h, z31.h\n" + ".inst 0xc1bfcc78 // sclamp { z24.s-z27.s }, z3.s, z31.s\n" + ".inst 0xc1bfcc70 // sclamp { z16.s-z19.s }, z3.s, z31.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + ".inst 0xc1bfcc74 // sclamp { z20.s-z23.s }, z3.s, z31.s\n" + ".inst 0xc1bfcc6c // sclamp { z12.s-z15.s }, z3.s, z31.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z18.h, z18.h, z19.h\n" + "uzp1 z20.h, z20.h, z21.h\n" + "uzp1 z17.h, z22.h, z23.h\n" "uzp1 z12.h, z12.h, z13.h\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z0.h, z0.h, z1.h\n" - "uzp1 z1.h, z2.h, z3.h\n" - "uzp1 z8.h, z8.h, z9.h\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p2, [x25]\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p2, [x25, #1, MUL VL]\n" - "uzp1 z0.b, z0.b, z1.b\n" - "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z0.b }, p2, [x25, #2, MUL VL]\n" - "st1b { z8.b }, p1, [x25, #3, MUL VL]\n" + "uzp1 z30.h, z14.h, z15.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p2, [x25]\n" + "uzp1 z16.b, z16.b, z18.b\n" + "st1b { z16.b }, p2, [x25, #1, MUL VL]\n" + "uzp1 z20.b, z20.b, z17.b\n" + "uzp1 z12.b, z12.b, z30.b\n" + "st1b { z20.b }, p2, [x25, #2, MUL VL]\n" + "st1b { z12.b }, p1, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" "43:" // Width 4: Output done "subs x27, x27, #0x4\n" @@ -665,7 +663,7 @@ void sme2_gemv_u8qa_dot_16VL ( "bgt 4b\n" "44:" // Exit ".inst 0xd503467f // SMSTOP\n" - "ptrue p2.b\n" + "ptrue p8.b\n" : [N] "+&r" (N), [flags] "+&r" (flags) : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", 
"z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -674,5 +672,4 @@ void sme2_gemv_u8qa_dot_16VL ( } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE +#endif // defined(ARM_COMPUTE_ENABLE_SME2) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp index 37eb63d898..edfb362aab 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../bfloat.hpp" #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp index c6eb858ade..8105300cb7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,12 +112,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" + ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z21.s, #1.0\n" + "fmov z6.s, #1.0\n" ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n" - ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n" - ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n" - ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n" - ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n" + ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n" + ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n" + ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n" + ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x9\n" "mov x21, x10\n" @@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "madd x23, x9, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - "ld1h { z0.h }, p0/Z, [x26]\n" - ".inst 0xa140a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n" - "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0xa141a6ea // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n" - ".inst 0xa142a6eb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n" + "ld1h { z28.h }, p0/Z, [x26]\n" + ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n" + "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n" + ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n" + ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n" "addvl x26, x26, #4\n" - ".inst 0xa143a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "ble 7f\n" "6:" // K loop - ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n" + ".inst 0x81880380 // bfmopa za0.s, p0/M, 
p0/M, z28.h, z8.h\n" "subs x22, x22, #0x1\n" - ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n" - ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n" - ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n" - "ld1h { z0.h }, p0/Z, [x26]\n" - ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n" - ".inst 0xa140a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n" - ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n" - ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n" - ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n" - "ld1h { z13.h }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n" - ".inst 0xa141a6ea // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n" - ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n" - ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n" - "ld1h { z12.h }, p0/Z, [x26, #2, MUL VL]\n" - ".inst 0xa142a6eb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n" - ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n" - ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n" - ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n" - "ld1h { z26.h }, p0/Z, [x26, #3, MUL VL]\n" + ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n" + ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n" + ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n" + "ld1h { z28.h }, p0/Z, [x26]\n" + ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n" + ".inst 0xa040a6e9 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x23]\n" + ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n" + ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n" + ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n" + "ld1h { z22.h }, p0/Z, [x26, #1, MUL VL]\n" + ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n" + ".inst 0xa041a6ed // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n" + ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n" + ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n" + "ld1h { z30.h }, p0/Z, [x26, #2, MUL VL]\n" + ".inst 0xa042a6e5 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n" + ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n" + ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n" + ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n" + "ld1h { z20.h }, p0/Z, [x26, #3, MUL VL]\n" "addvl x26, x26, #4\n" - ".inst 0xa143a6f8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa143a6fb // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n" - ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n" - ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n" - ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n" - ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n" - ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n" - ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n" - ".inst 0x818e01a3 // bfmopa 
za3.s, p0/M, p0/M, z13.h, z14.h\n" - ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n" - ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n" - ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n" - ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n" - ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n" - ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n" - ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n" - ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n" + ".inst 0x81880380 // bfmopa za0.s, p0/M, p0/M, z28.h, z8.h\n" + ".inst 0x81890381 // bfmopa za1.s, p0/M, p0/M, z28.h, z9.h\n" + ".inst 0x818a0382 // bfmopa za2.s, p0/M, p0/M, z28.h, z10.h\n" + ".inst 0x818b0383 // bfmopa za3.s, p0/M, p0/M, z28.h, z11.h\n" + ".inst 0x818c02c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z12.h\n" + ".inst 0x818d02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z13.h\n" + ".inst 0x818e02c2 // bfmopa za2.s, p0/M, p0/M, z22.h, z14.h\n" + ".inst 0x818f02c3 // bfmopa za3.s, p0/M, p0/M, z22.h, z15.h\n" + ".inst 0x818403c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z4.h\n" + ".inst 0x818503c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z5.h\n" + ".inst 0x818603c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z6.h\n" + ".inst 0x818703c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z7.h\n" + ".inst 0x81930280 // bfmopa za0.s, p0/M, p0/M, z20.h, z19.h\n" + ".inst 0x81970281 // bfmopa za1.s, p0/M, p0/M, z20.h, z23.h\n" + ".inst 0x819b0282 // bfmopa za2.s, p0/M, p0/M, z20.h, z27.h\n" + ".inst 0x819f0283 // bfmopa za3.s, p0/M, p0/M, z20.h, z31.h\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - "ld1h { z0.h }, p0/Z, [x26]\n" + "ld1h { z8.h }, p0/Z, [x26]\n" "subs x21, x21, #0x1\n" "addvl x26, x26, #1\n" - ".inst 0xa140a6f3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x23]\n" + ".inst 0xa140a6e3 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x23]\n" "addvl x23, x23, #4\n" - ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n" - ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n" - ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n" - ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n" + ".inst 0x81830100 // bfmopa za0.s, p0/M, p0/M, z8.h, z3.h\n" + ".inst 0x81870101 // bfmopa za1.s, p0/M, p0/M, z8.h, z7.h\n" + ".inst 0x818b0102 // bfmopa za2.s, p0/M, p0/M, z8.h, z11.h\n" + ".inst 0x818f0103 // bfmopa za3.s, p0/M, p0/M, z8.h, z15.h\n" "bgt 9b\n" "10:" // K oddments: End "tbz x15, #1, 14f\n" @@ -242,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" - ".inst 0xa041c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" - ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" - ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n" + ".inst 
0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" + ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" + ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n" + ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n" "addvl x14, x14, #16\n" - ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n" - ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n" - ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n" + ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n" + ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n" + ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n" "addvl x13, x13, #16\n" "blt 11b\n" "b 24f\n" @@ -268,15 +267,15 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n" - ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n" + ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" - ".inst 0xa061c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n" + ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n" + ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n" ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n" "addvl x13, x13, #16\n" "blt 13b\n" @@ -314,18 +313,18 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "16:" // Store to output array: Skip activation: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, 
z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 17f\n" - ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "add x25, x25, x23\n" "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End "subs x24, x24, x22\n" @@ -334,66 +333,66 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con "18:" // Store to output array: Skip activation: End "cntw x20\n" "cmp x24, x20\n" - "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x20, x24, x20, LT\n" "lsr x21, x20, #0x2\n" - "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x20, #0x3\n" "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n" - ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n" - ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "add x12, x12, #0x4\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "cmp x12, x21, LSL #2\n" - ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "add x25, x25, x23\n" - ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n" + ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n" "add x25, x25, x23\n" "blt 19b\n" "20:" // Store to output array: Accumulator row 0 oddments "cbz x20, 21f\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n" - ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n" - ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, 
za3h.s[x12]\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 21f\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 21f\n" - ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "21:" // Store to output array: Accumulator row 0 oddments: End "22:" // Store to output array: End "tbz x15, #0, 24f\n" "mov x12, #0x0\n" "cntw x20\n" "23:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" - ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" + ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x14, x14, #16\n" @@ -417,4 +416,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, con } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp index 89c79cfb0a..ca7b0573fc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../bfloat.hpp" #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp index b63f2110ff..20c1de9418 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" - ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z21.s, #1.0\n" - ".inst 0xa00a428f // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n" - ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n" - ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n" - ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n" - ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n" + "fmov z12.s, #1.0\n" + ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n" + ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n" + ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n" + ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n" + ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n" - ".inst 0xa14026ff // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n" - ".inst 0xa0412768 // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa04126e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa1422772 // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04226f1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa1432776 // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n" + ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n" + ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, 
#0x4, MUL VL]\n" + ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14326ec // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa04326e9 // ldnt1h { z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "ble 7f\n" "6:" // K loop - ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n" + ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n" "subs x22, x22, #0x1\n" - ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n" - ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n" - ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n" - ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n" - ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n" - ".inst 0xa14026ff // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x23]\n" - ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n" - ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n" - ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n" - ".inst 0xa0412768 // ld1h { z8.h-z9.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n" - ".inst 0xa04126e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n" - ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n" - ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n" - ".inst 0xa1422772 // ld1h { z18.h, z26.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04226f1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n" - ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n" - ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n" - ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n" - ".inst 0xa1432776 // ld1h { z22.h, z30.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n" + ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n" + ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n" + ".inst 0xa0402772 // ld1h { z18.h-z19.h }, pn9.b/Z, [x27]\n" + ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n" + ".inst 0xa04026e3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x23]\n" + ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n" + ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n" + ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n" + ".inst 0xa0412764 // ld1h { z4.h-z5.h }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n" + ".inst 0xa04126fb // ldnt1h { z26.h-z27.h }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n" + ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n" + ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n" + ".inst 0xa042276a // ld1h { z10.h-z11.h }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa04226f5 // ldnt1h { z20.h-z21.h }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n" + ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n" + ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n" + ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n" + ".inst 0xa0432766 // ld1h { z6.h-z7.h }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14326ec // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa04326e9 // ldnt1h 
{ z8.h-z9.h }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n" - ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n" - ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n" - ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n" - ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n" - ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n" - ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n" - ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n" - ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n" - ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n" - ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n" - ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n" - ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n" - ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n" - ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n" - ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n" + ".inst 0x81820240 // bfmopa za0.s, p0/M, p0/M, z18.h, z2.h\n" + ".inst 0x81830241 // bfmopa za1.s, p0/M, p0/M, z18.h, z3.h\n" + ".inst 0x81820262 // bfmopa za2.s, p0/M, p0/M, z19.h, z2.h\n" + ".inst 0x81830263 // bfmopa za3.s, p0/M, p0/M, z19.h, z3.h\n" + ".inst 0x819a0080 // bfmopa za0.s, p0/M, p0/M, z4.h, z26.h\n" + ".inst 0x819b0081 // bfmopa za1.s, p0/M, p0/M, z4.h, z27.h\n" + ".inst 0x819a00a2 // bfmopa za2.s, p0/M, p0/M, z5.h, z26.h\n" + ".inst 0x819b00a3 // bfmopa za3.s, p0/M, p0/M, z5.h, z27.h\n" + ".inst 0x81940140 // bfmopa za0.s, p0/M, p0/M, z10.h, z20.h\n" + ".inst 0x81950141 // bfmopa za1.s, p0/M, p0/M, z10.h, z21.h\n" + ".inst 0x81940162 // bfmopa za2.s, p0/M, p0/M, z11.h, z20.h\n" + ".inst 0x81950163 // bfmopa za3.s, p0/M, p0/M, z11.h, z21.h\n" + ".inst 0x818800c0 // bfmopa za0.s, p0/M, p0/M, z6.h, z8.h\n" + ".inst 0x818900c1 // bfmopa za1.s, p0/M, p0/M, z6.h, z9.h\n" + ".inst 0x818800e2 // bfmopa za2.s, p0/M, p0/M, z7.h, z8.h\n" + ".inst 0x818900e3 // bfmopa za3.s, p0/M, p0/M, z7.h, z9.h\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa1402767 // ld1h { z7.h, z15.h }, pn9.b/Z, [x27]\n" + ".inst 0xa040277e // ld1h { z30.h-z31.h }, pn9.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #2\n" - ".inst 0xa14026f7 // ld1h { z23.h, z31.h }, pn9.b/Z, [x23]\n" + ".inst 0xa14026e5 // ld1h { z5.h, z13.h }, pn9.b/Z, [x23]\n" "addvl x23, x23, #2\n" - ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n" - ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n" - ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n" - ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n" + ".inst 0x818503c0 // bfmopa za0.s, p0/M, p0/M, z30.h, z5.h\n" + ".inst 0x818d03c1 // bfmopa za1.s, p0/M, p0/M, z30.h, z13.h\n" + ".inst 0x818503e2 // bfmopa za2.s, p0/M, p0/M, z31.h, z5.h\n" + ".inst 0x818d03e3 // bfmopa za3.s, p0/M, p0/M, z31.h, z13.h\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -243,24 +242,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc086042c // mova 
{ z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" - ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" + ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 30f\n" @@ -268,16 +267,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n" - ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" + ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" + ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 30f\n" @@ -312,16 +311,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "16:" // Store to output array: Skip activation: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" + ".inst 0xc0860414 // 
mova { z20.s-z23.s }, za0h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" + ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 17f\n" - ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" + ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" "add x26, x26, x24\n" "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -349,16 +348,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "19:" // Store to output array: Skip activation: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" - ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" + ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" + ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" - ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" + ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n" "add x26, x26, x24\n" "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -367,44 +366,44 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "21:" // Store to output array: Skip activation: End "cntw x23\n" "cmp x25, x23\n" - "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x22, x25, x23, LT\n" "lsr x21, x22, #0x2\n" - "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x22, #0x3\n" "cbz x21, 23f\n" "22:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n" - ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n" - ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" + ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" + ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" + ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" + ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" "add x26, x26, x24\n" - ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n" + ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n" "add x26, x26, x24\n" "blt 22b\n" "23:" // Store to output array: Accumulator row 0 oddments "cbz x20, 24f\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - 
".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n" - ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n" + ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n" + ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" - ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n" + ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n" "add x26, x26, x24\n" "24:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -418,8 +417,8 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "25:" // Store to output array: Accumulator row 1 loop ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" - ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" - ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" @@ -435,8 +434,8 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "cbz x20, 27f\n" ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" - ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" - ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" @@ -452,14 +451,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "29:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -483,4 +482,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, con } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp index 0d407e0cba..7b31d6d2db 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../bfloat.hpp" #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp index a51b3db4b0..70c94d32a3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z8.s, #1.0\n" - "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n" - ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n" + "fmov z11.s, #1.0\n" + "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n" + ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, 
z11.s, z13.s\n" + ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n" + ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n" + ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, z11.s, z13.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -166,75 +165,75 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n" - "ldnt1h { z29.h }, p1/Z, [x23]\n" - ".inst 0xa041a36c // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa042a360 // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa143a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n" + "ldnt1h { z19.h }, p1/Z, [x23]\n" + ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "ble 7f\n" "6:" // K loop - ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n" + ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n" "subs x22, x22, #0x1\n" - ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n" - ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n" - ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n" - ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n" - ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n" - "ldnt1h { z29.h }, p1/Z, [x23]\n" - ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n" - ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n" - ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n" - ".inst 0xa041a36c // ld1h { z12.h-z15.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n" - "ldnt1h { z23.h }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n" - ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n" - ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n" - ".inst 0xa042a360 // ld1h { z0.h-z3.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1h { z21.h }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n" - ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n" - ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n" - ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n" - ".inst 0xa143a372 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n" + ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n" + ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n" + ".inst 0xa140a360 // ld1h { z0.h, z4.h, z8.h, z12.h }, pn8.b/Z, [x27]\n" + ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n" + "ldnt1h { z19.h }, p1/Z, [x23]\n" + ".inst 0x819626a1 // bfmopa za1.s, p1/M, 
p1/M, z21.h, z22.h\n" + ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n" + ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n" + ".inst 0xa141a371 // ld1h { z17.h, z21.h, z25.h, z29.h }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n" + "ldnt1h { z22.h }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n" + ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n" + ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n" + ".inst 0xa142a370 // ld1h { z16.h, z20.h, z24.h, z28.h }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1h { z23.h }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n" + ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n" + ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n" + ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n" + ".inst 0xa143a363 // ld1h { z3.h, z7.h, z11.h, z15.h }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1h { z2.h }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n" - ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n" - ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n" - ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n" - ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n" - ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n" - ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n" - ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n" - ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n" - ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n" - ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n" - ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n" - ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n" - ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n" - ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n" - ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n" + ".inst 0x81932400 // bfmopa za0.s, p1/M, p1/M, z0.h, z19.h\n" + ".inst 0x81932481 // bfmopa za1.s, p1/M, p1/M, z4.h, z19.h\n" + ".inst 0x81932502 // bfmopa za2.s, p1/M, p1/M, z8.h, z19.h\n" + ".inst 0x81932583 // bfmopa za3.s, p1/M, p1/M, z12.h, z19.h\n" + ".inst 0x81962620 // bfmopa za0.s, p1/M, p1/M, z17.h, z22.h\n" + ".inst 0x819626a1 // bfmopa za1.s, p1/M, p1/M, z21.h, z22.h\n" + ".inst 0x81962722 // bfmopa za2.s, p1/M, p1/M, z25.h, z22.h\n" + ".inst 0x819627a3 // bfmopa za3.s, p1/M, p1/M, z29.h, z22.h\n" + ".inst 0x81972600 // bfmopa za0.s, p1/M, p1/M, z16.h, z23.h\n" + ".inst 0x81972681 // bfmopa za1.s, p1/M, p1/M, z20.h, z23.h\n" + ".inst 0x81972702 // bfmopa za2.s, p1/M, p1/M, z24.h, z23.h\n" + ".inst 0x81972783 // bfmopa za3.s, p1/M, p1/M, z28.h, z23.h\n" + ".inst 0x81822460 // bfmopa za0.s, p1/M, p1/M, z3.h, z2.h\n" + ".inst 0x818224e1 // bfmopa za1.s, p1/M, p1/M, z7.h, z2.h\n" + ".inst 0x81822562 // bfmopa za2.s, p1/M, p1/M, z11.h, z2.h\n" + ".inst 0x818225e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z2.h\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa040a364 // ld1h { z4.h-z7.h }, pn8.b/Z, [x27]\n" + ".inst 0xa140a373 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn8.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #4\n" - "ld1h { z29.h }, p1/Z, [x23]\n" + 
"ld1h { z11.h }, p1/Z, [x23]\n" "addvl x23, x23, #1\n" - ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n" - ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n" - ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n" - ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n" + ".inst 0x818b2660 // bfmopa za0.s, p1/M, p1/M, z19.h, z11.h\n" + ".inst 0x818b26e1 // bfmopa za1.s, p1/M, p1/M, z23.h, z11.h\n" + ".inst 0x818b2762 // bfmopa za2.s, p1/M, p1/M, z27.h, z11.h\n" + ".inst 0x818b27e3 // bfmopa za3.s, p1/M, p1/M, z31.h, z11.h\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -242,25 +241,25 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n" + ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n" ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" - ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" + ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xa042c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n" - ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n" "addvl x15, x15, #16\n" ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 42f\n" @@ -269,15 +268,15 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n" ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n" - ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" - ".inst 0xa061c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" 
".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 42f\n" @@ -296,16 +295,16 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x22, #0x3\n" "cbz x21, 16f\n" "15:" // Store to output array: Skip activation: Accumulator row 0 loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 15b\n" "16:" // Store to output array: Skip activation: Accumulator row 0 oddments @@ -331,30 +330,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x22, #0x3\n" "cbz x21, 19f\n" "18:" // Store to output array: Skip activation: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + "st1w { z8.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z9.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z10.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z11.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 18b\n" "19:" // Store to output array: Skip activation: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - "st1w { z4.s }, p0, [x26]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + "st1w { z24.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" "subs x20, x20, #0x1\n" - "st1w { z5.s }, p0, [x26]\n" + "st1w { z25.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" - "st1w { z6.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x26]\n" "add x26, x26, x24\n" "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -366,30 +365,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x22, #0x3\n" "cbz x21, 22f\n" "21:" // Store to output array: Skip activation: Accumulator row 2 loop - ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 21b\n" "22:" // Store to output array: Skip activation: Accumulator row 2 oddments "cbz x20, 23f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + "st1w { z12.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 23f\n" "subs x20, x20, #0x1\n" - "st1w { 
z21.s }, p0, [x26]\n" + "st1w { z13.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 23f\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z14.s }, p0, [x26]\n" "add x26, x26, x24\n" "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End "subs x25, x25, x22\n" @@ -401,30 +400,30 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x22, #0x3\n" "cbz x21, 25f\n" "24:" // Store to output array: Skip activation: Accumulator row 3 loop - ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" - "st1w { z4.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z5.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z6.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z7.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 24b\n" "25:" // Store to output array: Skip activation: Accumulator row 3 oddments "cbz x20, 26f\n" "subs x20, x20, #0x1\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - "st1w { z12.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 26f\n" "subs x20, x20, #0x1\n" - "st1w { z13.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 26f\n" - "st1w { z14.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End "subs x25, x25, x22\n" @@ -433,40 +432,40 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "27:" // Store to output array: Skip activation: End "cntw x23\n" "cmp x25, x23\n" - "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x22, x25, x23, LT\n" "lsr x21, x22, #0x2\n" - "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x22, #0x3\n" "cbz x21, 29f\n" "28:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" - ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" + ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1w { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z21.s }, p0, [x26]\n" + "st1w { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z30.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z23.s }, p0, [x26]\n" + "st1w { z31.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 28b\n" "29:" // Store to output array: Accumulator row 0 oddments "cbz x20, 30f\n" - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n" - "st1w { z8.s }, p0, [x26]\n" + ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1w { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 30f\n" "subs x20, x20, #0x1\n" - "st1w { z9.s }, p0, [x26]\n" + "st1w { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 30f\n" - "st1w { z10.s }, p0, [x26]\n" + "st1w { z30.s }, p0, [x26]\n" 
"add x26, x26, x24\n" "30:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -478,24 +477,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x22, #0x3\n" "cbz x21, 32f\n" "31:" // Store to output array: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 31b\n" "32:" // Store to output array: Accumulator row 1 oddments "cbz x20, 33f\n" ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 33f\n" @@ -516,7 +515,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "cbz x21, 35f\n" "34:" // Store to output array: Accumulator row 2 loop ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "st1w { z17.s }, p0, [x26]\n" @@ -532,7 +531,7 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "cbz x20, 36f\n" ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 36f\n" @@ -552,24 +551,24 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con "and x20, x20, #0x3\n" "cbz x21, 38f\n" "37:" // Store to output array: Accumulator row 3 loop - ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" - ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z21.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z23.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 37b\n" "38:" // Store to output array: Accumulator row 3 oddments "cbz x20, 39f\n" ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 39f\n" @@ -588,10 +587,10 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL 
VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -615,4 +614,3 @@ void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, con } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp index 7777349b42..bf3de2118e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp index dd99387c5e..97be758bd6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,12 +112,12 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" + ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa042c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z21.s, #1.0\n" + "fmov z6.s, #1.0\n" ".inst 0xa009c29d // ldnt1w { z28.s-z31.s }, p8/Z, [x20, x9, LSL #2]\n" - ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n" - ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n" - ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n" - ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n" + ".inst 0x809c00c0 // fmopa za0.s, p0/M, p0/M, z6.s, z28.s\n" + ".inst 0x809d00c1 // fmopa za1.s, p0/M, p0/M, z6.s, z29.s\n" + ".inst 0x809e00c2 // fmopa za2.s, p0/M, p0/M, z6.s, z30.s\n" + ".inst 0x809f00c3 // fmopa za3.s, p0/M, p0/M, z6.s, z31.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x9\n" "mov x21, x10\n" @@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "madd x21, x9, x20, x21\n" // bptr = B + n * kstride_bytes "cbz x23, 8f\n" "subs x23, x23, #0x1\n" - "ld1w { z0.s }, p0/Z, [x26]\n" - ".inst 0xa140c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n" - "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0xa141c6aa // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" - "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n" - ".inst 0xa142c6ab // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n" - "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n" + "ld1w { z28.s }, p0/Z, [x26]\n" + ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n" + "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n" + ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" + "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n" + ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n" + "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n" "addvl x26, x26, #4\n" - ".inst 0xa143c6b8 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n" + ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n" "addvl x21, x21, #16\n" "ble 7f\n" "6:" // K loop - ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n" + ".inst 0x80880380 // fmopa za0.s, p0/M, 
p0/M, z28.s, z8.s\n" "subs x23, x23, #0x1\n" - ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n" - ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n" - ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n" - "ld1w { z0.s }, p0/Z, [x26]\n" - ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n" - ".inst 0xa140c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n" - ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n" - ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n" - ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n" - "ld1w { z13.s }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n" - ".inst 0xa141c6aa // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" - ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n" - ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n" - ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n" - "ld1w { z12.s }, p0/Z, [x26, #2, MUL VL]\n" - ".inst 0xa142c6ab // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n" - ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n" - ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n" - ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n" - ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n" - "ld1w { z26.s }, p0/Z, [x26, #3, MUL VL]\n" + ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n" + ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n" + ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n" + "ld1w { z28.s }, p0/Z, [x26]\n" + ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n" + ".inst 0xa040c6a9 // ldnt1w { z8.s-z11.s }, pn9.b/Z, [x21]\n" + ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n" + ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n" + ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n" + "ld1w { z22.s }, p0/Z, [x26, #1, MUL VL]\n" + ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n" + ".inst 0xa041c6ad // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" + ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n" + ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n" + ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n" + "ld1w { z30.s }, p0/Z, [x26, #2, MUL VL]\n" + ".inst 0xa042c6a5 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x21, #0x8, MUL VL]\n" + ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n" + ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n" + ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n" + ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n" + "ld1w { z20.s }, p0/Z, [x26, #3, MUL VL]\n" "addvl x26, x26, #4\n" - ".inst 0xa143c6b8 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n" + ".inst 0xa143c6bb // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21, #0xc, MUL VL]\n" "addvl x21, x21, #16\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n" - ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n" - ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n" - ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n" - ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n" - ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n" - ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n" - ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n" - 
".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n" - ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n" - ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n" - ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n" - ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n" - ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n" - ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n" - ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n" + ".inst 0x80880380 // fmopa za0.s, p0/M, p0/M, z28.s, z8.s\n" + ".inst 0x80890381 // fmopa za1.s, p0/M, p0/M, z28.s, z9.s\n" + ".inst 0x808a0382 // fmopa za2.s, p0/M, p0/M, z28.s, z10.s\n" + ".inst 0x808b0383 // fmopa za3.s, p0/M, p0/M, z28.s, z11.s\n" + ".inst 0x808c02c0 // fmopa za0.s, p0/M, p0/M, z22.s, z12.s\n" + ".inst 0x808d02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z13.s\n" + ".inst 0x808e02c2 // fmopa za2.s, p0/M, p0/M, z22.s, z14.s\n" + ".inst 0x808f02c3 // fmopa za3.s, p0/M, p0/M, z22.s, z15.s\n" + ".inst 0x808403c0 // fmopa za0.s, p0/M, p0/M, z30.s, z4.s\n" + ".inst 0x808503c1 // fmopa za1.s, p0/M, p0/M, z30.s, z5.s\n" + ".inst 0x808603c2 // fmopa za2.s, p0/M, p0/M, z30.s, z6.s\n" + ".inst 0x808703c3 // fmopa za3.s, p0/M, p0/M, z30.s, z7.s\n" + ".inst 0x80930280 // fmopa za0.s, p0/M, p0/M, z20.s, z19.s\n" + ".inst 0x80970281 // fmopa za1.s, p0/M, p0/M, z20.s, z23.s\n" + ".inst 0x809b0282 // fmopa za2.s, p0/M, p0/M, z20.s, z27.s\n" + ".inst 0x809f0283 // fmopa za3.s, p0/M, p0/M, z20.s, z31.s\n" "8:" // K oddments "cbz x22, 10f\n" "9:" // K oddments: Loop - "ld1w { z0.s }, p0/Z, [x26]\n" + "ld1w { z8.s }, p0/Z, [x26]\n" "subs x22, x22, #0x1\n" "addvl x26, x26, #1\n" - ".inst 0xa140c6b3 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x21]\n" + ".inst 0xa140c6a3 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x21]\n" "addvl x21, x21, #4\n" - ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n" - ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n" - ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n" - ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n" + ".inst 0x80830100 // fmopa za0.s, p0/M, p0/M, z8.s, z3.s\n" + ".inst 0x80870101 // fmopa za1.s, p0/M, p0/M, z8.s, z7.s\n" + ".inst 0x808b0102 // fmopa za2.s, p0/M, p0/M, z8.s, z11.s\n" + ".inst 0x808f0103 // fmopa za3.s, p0/M, p0/M, z8.s, z15.s\n" "bgt 9b\n" "10:" // K oddments: End "tbz x15, #1, 14f\n" @@ -240,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" - ".inst 0xa041c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" - ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" - ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5d4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x14]\n" + ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc0840680 // 
mova za0h.s[x12], { z20.s-z23.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" + ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" + ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n" + ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n" "addvl x14, x14, #16\n" - ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n" - ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n" - ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n" + ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n" + ".inst 0xa062c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n" + ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n" "addvl x13, x13, #16\n" "blt 11b\n" "b 24f\n" @@ -266,15 +265,15 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n" - ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n" + ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" - ".inst 0xa061c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x4, MUL VL]\n" + ".inst 0xa061c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n" + ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n" ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n" "addvl x13, x13, #16\n" "blt 13b\n" @@ -312,18 +311,18 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "16:" // Store to output array: Skip activation: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 17f\n" - 
".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "add x25, x25, x23\n" "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End "subs x24, x24, x22\n" @@ -332,66 +331,66 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa "18:" // Store to output array: Skip activation: End "cntw x20\n" "cmp x24, x20\n" - "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x20, x24, x20, LT\n" "lsr x21, x20, #0x2\n" - "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x20, #0x3\n" "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n" - ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n" - ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "add x12, x12, #0x4\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "cmp x12, x21, LSL #2\n" - ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "add x25, x25, x23\n" - ".inst 0xa160c323 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x25]\n" + ".inst 0xa160c333 // st1w { z19.s, z23.s, z27.s, z31.s }, p8, [x25]\n" "add x25, x25, x23\n" "blt 19b\n" "20:" // Store to output array: Accumulator row 0 oddments "cbz x20, 21f\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n" - ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n" - ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, 
z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c320 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x25]\n" + ".inst 0xa160c330 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 21f\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c321 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x25]\n" + ".inst 0xa160c331 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x25]\n" "add x25, x25, x23\n" "beq 21f\n" - ".inst 0xa160c322 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x25]\n" + ".inst 0xa160c332 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x25]\n" "21:" // Store to output array: Accumulator row 0 oddments: End "22:" // Store to output array: End "tbz x15, #0, 24f\n" "mov x12, #0x0\n" "cntw x20\n" "23:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" - ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n" + ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x14, x14, #16\n" @@ -415,4 +414,3 @@ void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const floa } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp index 51e8c43335..9bc1f83100 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp index 87d7827c5b..3c475044e2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" - ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa040c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa042c5f4 // ld1w { z20.s-z23.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z21.s, #1.0\n" - ".inst 0xa00a428f // ldnt1w { z14.s-z15.s }, p8/Z, [x20, x10, LSL #2]\n" - ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n" - ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n" - ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n" - ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n" + "fmov z12.s, #1.0\n" + ".inst 0xa10a4289 // ldnt1w { z1.s, z9.s }, p8/Z, [x20, x10, LSL #2]\n" + ".inst 0x80810180 // fmopa za0.s, p0/M, p0/M, z12.s, z1.s\n" + ".inst 0x80890181 // fmopa za1.s, p0/M, p0/M, z12.s, z9.s\n" + ".inst 0x80810182 // fmopa za2.s, p0/M, p0/M, z12.s, z1.s\n" + ".inst 0x80890183 // fmopa za3.s, p0/M, p0/M, z12.s, z9.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes "cbz x23, 8f\n" "subs x23, x23, #0x1\n" - ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n" - ".inst 0xa14046bf // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n" - ".inst 0xa0414768 // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa04146a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n" - ".inst 0xa1424772 // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04246b1 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" - ".inst 0xa1434776 // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n" + ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n" + ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n" + ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, 
#0x4, MUL VL]\n" + ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14346ac // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n" + ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n" "addvl x21, x21, #8\n" "ble 7f\n" "6:" // K loop - ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n" + ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n" "subs x23, x23, #0x1\n" - ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n" - ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n" - ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n" - ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n" - ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n" - ".inst 0xa14046bf // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x21]\n" - ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n" - ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n" - ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n" - ".inst 0xa0414768 // ld1w { z8.s-z9.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n" - ".inst 0xa04146a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n" - ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n" - ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n" - ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n" - ".inst 0xa1424772 // ld1w { z18.s, z26.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04246b1 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" - ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n" - ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n" - ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n" - ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n" - ".inst 0xa1434776 // ld1w { z22.s, z30.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n" + ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n" + ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n" + ".inst 0xa0404772 // ld1w { z18.s-z19.s }, pn9.b/Z, [x27]\n" + ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n" + ".inst 0xa04046a3 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x21]\n" + ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n" + ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n" + ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n" + ".inst 0xa0414764 // ld1w { z4.s-z5.s }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n" + ".inst 0xa04146bb // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x21, #0x2, MUL VL]\n" + ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n" + ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n" + ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n" + ".inst 0xa042476a // ld1w { z10.s-z11.s }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa04246b5 // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x21, #0x4, MUL VL]\n" + ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n" + ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n" + ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n" + ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n" + ".inst 0xa0434766 // ld1w { z6.s-z7.s }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14346ac // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x21, #0x6, MUL VL]\n" + ".inst 0xa04346a9 // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x21, 
#0x6, MUL VL]\n" "addvl x21, x21, #8\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n" - ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n" - ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n" - ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n" - ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n" - ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n" - ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n" - ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n" - ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n" - ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n" - ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n" - ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n" - ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n" - ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n" - ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n" - ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n" + ".inst 0x80820240 // fmopa za0.s, p0/M, p0/M, z18.s, z2.s\n" + ".inst 0x80830241 // fmopa za1.s, p0/M, p0/M, z18.s, z3.s\n" + ".inst 0x80820262 // fmopa za2.s, p0/M, p0/M, z19.s, z2.s\n" + ".inst 0x80830263 // fmopa za3.s, p0/M, p0/M, z19.s, z3.s\n" + ".inst 0x809a0080 // fmopa za0.s, p0/M, p0/M, z4.s, z26.s\n" + ".inst 0x809b0081 // fmopa za1.s, p0/M, p0/M, z4.s, z27.s\n" + ".inst 0x809a00a2 // fmopa za2.s, p0/M, p0/M, z5.s, z26.s\n" + ".inst 0x809b00a3 // fmopa za3.s, p0/M, p0/M, z5.s, z27.s\n" + ".inst 0x80940140 // fmopa za0.s, p0/M, p0/M, z10.s, z20.s\n" + ".inst 0x80950141 // fmopa za1.s, p0/M, p0/M, z10.s, z21.s\n" + ".inst 0x80940162 // fmopa za2.s, p0/M, p0/M, z11.s, z20.s\n" + ".inst 0x80950163 // fmopa za3.s, p0/M, p0/M, z11.s, z21.s\n" + ".inst 0x808800c0 // fmopa za0.s, p0/M, p0/M, z6.s, z8.s\n" + ".inst 0x808900c1 // fmopa za1.s, p0/M, p0/M, z6.s, z9.s\n" + ".inst 0x808800e2 // fmopa za2.s, p0/M, p0/M, z7.s, z8.s\n" + ".inst 0x808900e3 // fmopa za3.s, p0/M, p0/M, z7.s, z9.s\n" "8:" // K oddments "cbz x22, 10f\n" "9:" // K oddments: Loop - ".inst 0xa1404767 // ld1w { z7.s, z15.s }, pn9.b/Z, [x27]\n" + ".inst 0xa040477e // ld1w { z30.s-z31.s }, pn9.b/Z, [x27]\n" "subs x22, x22, #0x1\n" "addvl x27, x27, #2\n" - ".inst 0xa14046b7 // ld1w { z23.s, z31.s }, pn9.b/Z, [x21]\n" + ".inst 0xa14046a5 // ld1w { z5.s, z13.s }, pn9.b/Z, [x21]\n" "addvl x21, x21, #2\n" - ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n" - ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n" - ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n" - ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n" + ".inst 0x808503c0 // fmopa za0.s, p0/M, p0/M, z30.s, z5.s\n" + ".inst 0x808d03c1 // fmopa za1.s, p0/M, p0/M, z30.s, z13.s\n" + ".inst 0x808503e2 // fmopa za2.s, p0/M, p0/M, z31.s, z5.s\n" + ".inst 0x808d03e3 // fmopa za3.s, p0/M, p0/M, z31.s, z13.s\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -241,24 +240,24 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa041c5e0 // ld1w { 
z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" - ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa043c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" + ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 30f\n" @@ -266,16 +265,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n" - ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xa061c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" + ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" + ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 30f\n" @@ -310,16 +309,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "16:" // Store to output array: Skip activation: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" + ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" + ".inst 0xc086043c // 
mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" + ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 17f\n" - ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" + ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" "add x26, x26, x24\n" "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -347,16 +346,16 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "19:" // Store to output array: Skip activation: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" - ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" + ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" + ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" - ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" + ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n" "add x26, x26, x24\n" "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -365,44 +364,44 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "21:" // Store to output array: Skip activation: End "cntw x23\n" "cmp x25, x23\n" - "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z1.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x22, x25, x23, LT\n" "lsr x21, x22, #0x2\n" - "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z0.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x22, #0x3\n" "cbz x21, 23f\n" "22:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n" - ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n" - ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" + ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xc1a0c834 // fclamp { z20.s-z23.s }, z1.s, z0.s\n" + ".inst 0xc1a0c83c // fclamp { z28.s-z31.s }, z1.s, z0.s\n" + ".inst 0xa1604354 // st1w { z20.s, z28.s }, p8, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" + ".inst 0xa1604355 // st1w { z21.s, z29.s }, p8, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" + ".inst 0xa1604356 // st1w { z22.s, z30.s }, p8, [x26]\n" "add x26, x26, x24\n" - ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n" + ".inst 0xa1604357 // st1w { z23.s, z31.s }, p8, [x26]\n" "add x26, x26, x24\n" "blt 22b\n" "23:" // Store to output array: Accumulator row 0 oddments "cbz x20, 24f\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n" - 
".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n" + ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n" + ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" - ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n" + ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n" "add x26, x26, x24\n" "24:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -416,8 +415,8 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "25:" // Store to output array: Accumulator row 1 loop ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" - ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" - ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" @@ -433,8 +432,8 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "cbz x20, 27f\n" ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" - ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" - ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1a0c830 // fclamp { z16.s-z19.s }, z1.s, z0.s\n" + ".inst 0xc1a0c838 // fclamp { z24.s-z27.s }, z1.s, z0.s\n" "subs x20, x20, #0x1\n" ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" "add x26, x26, x24\n" @@ -450,14 +449,14 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "29:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -481,4 +480,3 @@ void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const floa } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp index a315ebb323..165e25dd8f 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp index 291a7ced5a..ae1f812442 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -113,14 +112,14 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa042c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -138,12 +137,12 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "fmov z8.s, #1.0\n" - "ldnt1w { z27.s }, p0/Z, [x20, x10, LSL #2]\n" - ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n" - ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n" + "fmov z11.s, #1.0\n" + "ldnt1w { z13.s }, p0/Z, [x20, x10, LSL #2]\n" + ".inst 0x808d2560 // fmopa za0.s, p1/M, p1/M, z11.s, z13.s\n" + ".inst 0x808d2561 // fmopa za1.s, p1/M, p1/M, z11.s, z13.s\n" + ".inst 0x808d2562 // fmopa za2.s, p1/M, p1/M, z11.s, z13.s\n" + ".inst 0x808d2563 // fmopa za3.s, p1/M, p1/M, 
z11.s, z13.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -164,75 +163,75 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "madd x21, x10, x20, x21\n" // bptr = B + n * kstride_bytes "cbz x23, 8f\n" "subs x23, x23, #0x1\n" - ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n" - "ldnt1w { z29.s }, p1/Z, [x21]\n" - ".inst 0xa041c36c // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n" - ".inst 0xa042c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n" - ".inst 0xa143c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n" + "ldnt1w { z19.s }, p1/Z, [x21]\n" + ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n" + ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n" + ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n" "addvl x21, x21, #4\n" "ble 7f\n" "6:" // K loop - ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n" + ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n" "subs x23, x23, #0x1\n" - ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n" - ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n" - ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n" - ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n" - ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n" - "ldnt1w { z29.s }, p1/Z, [x21]\n" - ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n" - ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n" - ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n" - ".inst 0xa041c36c // ld1w { z12.s-z15.s }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n" - "ldnt1w { z23.s }, p1/Z, [x21, #1, MUL VL]\n" - ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n" - ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n" - ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n" - ".inst 0xa042c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1w { z21.s }, p1/Z, [x21, #2, MUL VL]\n" - ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n" - ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n" - ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n" - ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n" - ".inst 0xa143c372 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n" + ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n" + ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n" + ".inst 0xa140c360 // ld1w { z0.s, z4.s, z8.s, z12.s }, pn8.b/Z, [x27]\n" + ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n" + "ldnt1w { z19.s }, p1/Z, [x21]\n" + ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n" + ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n" + ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n" + ".inst 0xa141c371 // ld1w { z17.s, z21.s, z25.s, z29.s }, pn8.b/Z, 
[x27, #0x4, MUL VL]\n" + ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n" + "ldnt1w { z22.s }, p1/Z, [x21, #1, MUL VL]\n" + ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n" + ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n" + ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n" + ".inst 0xa142c370 // ld1w { z16.s, z20.s, z24.s, z28.s }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1w { z23.s }, p1/Z, [x21, #2, MUL VL]\n" + ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n" + ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n" + ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n" + ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n" + ".inst 0xa143c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ldnt1w { z2.s }, p1/Z, [x21, #3, MUL VL]\n" "addvl x21, x21, #4\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n" - ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n" - ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n" - ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n" - ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n" - ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n" - ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n" - ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n" - ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n" - ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n" - ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n" - ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n" - ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n" - ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n" - ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n" - ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n" + ".inst 0x80932400 // fmopa za0.s, p1/M, p1/M, z0.s, z19.s\n" + ".inst 0x80932481 // fmopa za1.s, p1/M, p1/M, z4.s, z19.s\n" + ".inst 0x80932502 // fmopa za2.s, p1/M, p1/M, z8.s, z19.s\n" + ".inst 0x80932583 // fmopa za3.s, p1/M, p1/M, z12.s, z19.s\n" + ".inst 0x80962620 // fmopa za0.s, p1/M, p1/M, z17.s, z22.s\n" + ".inst 0x809626a1 // fmopa za1.s, p1/M, p1/M, z21.s, z22.s\n" + ".inst 0x80962722 // fmopa za2.s, p1/M, p1/M, z25.s, z22.s\n" + ".inst 0x809627a3 // fmopa za3.s, p1/M, p1/M, z29.s, z22.s\n" + ".inst 0x80972600 // fmopa za0.s, p1/M, p1/M, z16.s, z23.s\n" + ".inst 0x80972681 // fmopa za1.s, p1/M, p1/M, z20.s, z23.s\n" + ".inst 0x80972702 // fmopa za2.s, p1/M, p1/M, z24.s, z23.s\n" + ".inst 0x80972783 // fmopa za3.s, p1/M, p1/M, z28.s, z23.s\n" + ".inst 0x80822460 // fmopa za0.s, p1/M, p1/M, z3.s, z2.s\n" + ".inst 0x808224e1 // fmopa za1.s, p1/M, p1/M, z7.s, z2.s\n" + ".inst 0x80822562 // fmopa za2.s, p1/M, p1/M, z11.s, z2.s\n" + ".inst 0x808225e3 // fmopa za3.s, p1/M, p1/M, z15.s, z2.s\n" "8:" // K oddments "cbz x22, 10f\n" "9:" // K oddments: Loop - ".inst 0xa040c364 // ld1w { z4.s-z7.s }, pn8.b/Z, [x27]\n" + ".inst 0xa140c373 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn8.b/Z, [x27]\n" "subs x22, x22, #0x1\n" "addvl x27, x27, #4\n" - "ld1w { z29.s }, p1/Z, [x21]\n" + "ld1w { z11.s }, p1/Z, [x21]\n" "addvl x21, x21, #1\n" - ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n" - ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n" - ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n" - ".inst 
0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n" + ".inst 0x808b2660 // fmopa za0.s, p1/M, p1/M, z19.s, z11.s\n" + ".inst 0x808b26e1 // fmopa za1.s, p1/M, p1/M, z23.s, z11.s\n" + ".inst 0x808b2762 // fmopa za2.s, p1/M, p1/M, z27.s, z11.s\n" + ".inst 0x808b27e3 // fmopa za3.s, p1/M, p1/M, z31.s, z11.s\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -240,25 +239,25 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15]\n" + ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n" ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" - ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" + ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xa042c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n" - ".inst 0xa043c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa042c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" ".inst 0xa060c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14]\n" "addvl x15, x15, #16\n" ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 42f\n" @@ -267,15 +266,15 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" ".inst 0xa060c1cc // st1w { z12.s-z15.s }, pn8.b, [x14]\n" ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n" - ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" - ".inst 0xa061c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" ".inst 0xa062c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 
42f\n" @@ -294,16 +293,16 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x22, #0x3\n" "cbz x21, 16f\n" "15:" // Store to output array: Skip activation: Accumulator row 0 loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 15b\n" "16:" // Store to output array: Skip activation: Accumulator row 0 oddments @@ -329,30 +328,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x22, #0x3\n" "cbz x21, 19f\n" "18:" // Store to output array: Skip activation: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + "st1w { z8.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z9.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z10.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z11.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 18b\n" "19:" // Store to output array: Skip activation: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - "st1w { z4.s }, p0, [x26]\n" + ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" + "st1w { z24.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" "subs x20, x20, #0x1\n" - "st1w { z5.s }, p0, [x26]\n" + "st1w { z25.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 20f\n" - "st1w { z6.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x26]\n" "add x26, x26, x24\n" "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -364,30 +363,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x22, #0x3\n" "cbz x21, 22f\n" "21:" // Store to output array: Skip activation: Accumulator row 2 loop - ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 21b\n" "22:" // Store to output array: Skip activation: Accumulator row 2 oddments "cbz x20, 23f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + "st1w { z12.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 23f\n" "subs x20, x20, #0x1\n" - "st1w { z21.s }, p0, [x26]\n" + "st1w { z13.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 23f\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z14.s }, p0, [x26]\n" "add x26, x26, x24\n" "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End 
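// --- Illustrative sketch, not part of the patch: the fp32 K loop above is built
// from FMOPA outer-product accumulates. Each "fmopa za<t>.s, p1/M, p1/M, zA.s, zB.s"
// does za[r][c] += A[r] * B[c] over one tile under merging predication, so the
// register renumbering in these hunks changes scheduling only, not the result.
// A scalar reference model, using hypothetical names:

#include <cstddef>

inline void fmopa_ref(float *za, const float *a, const float *b,
                      const bool *pa, const bool *pb, std::size_t vl)
{
    for (std::size_t r = 0; r < vl; r++)         // vl = vector length in words
        for (std::size_t c = 0; c < vl; c++)
            if (pa[r] && pb[c])                  // both governing predicates active
                za[r * vl + c] += a[r] * b[c];   // inactive elements keep old za
}

// The 4VLx1VL shape keeps four row tiles (za0-za3) live against one column vector
// of B, hence four consecutive fmopa per B load; the fclamp pair in the store path
// below then clamps results to the activation min/max from KernelArgs.
// --- End of sketch.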
"subs x25, x25, x22\n" @@ -399,30 +398,30 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x22, #0x3\n" "cbz x21, 25f\n" "24:" // Store to output array: Skip activation: Accumulator row 3 loop - ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" - "st1w { z4.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z5.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z6.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z7.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 24b\n" "25:" // Store to output array: Skip activation: Accumulator row 3 oddments "cbz x20, 26f\n" "subs x20, x20, #0x1\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - "st1w { z12.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 26f\n" "subs x20, x20, #0x1\n" - "st1w { z13.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 26f\n" - "st1w { z14.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End "subs x25, x25, x22\n" @@ -431,40 +430,40 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "27:" // Store to output array: Skip activation: End "cntw x23\n" "cmp x25, x23\n" - "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "ld1rw { z21.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" "csel x22, x25, x23, LT\n" "lsr x21, x22, #0x2\n" - "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "ld1rw { z20.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" "mov x12, #0x0\n" "and x20, x22, #0x3\n" "cbz x21, 29f\n" "28:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" - ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" + ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1w { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z21.s }, p0, [x26]\n" + "st1w { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z30.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z23.s }, p0, [x26]\n" + "st1w { z31.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 28b\n" "29:" // Store to output array: Accumulator row 0 oddments "cbz x20, 30f\n" - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n" - "st1w { z8.s }, p0, [x26]\n" + ".inst 0xc1b4cabc // fclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1w { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 30f\n" "subs x20, x20, #0x1\n" - "st1w { z9.s }, p0, [x26]\n" + "st1w { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 30f\n" - "st1w { z10.s }, p0, [x26]\n" + "st1w { z30.s }, p0, [x26]\n" "add x26, x26, x24\n" "30:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -476,24 +475,24 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x22, #0x3\n" "cbz x21, 32f\n" 
"31:" // Store to output array: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 31b\n" "32:" // Store to output array: Accumulator row 1 oddments "cbz x20, 33f\n" ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 33f\n" @@ -514,7 +513,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "cbz x21, 35f\n" "34:" // Store to output array: Accumulator row 2 loop ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "st1w { z17.s }, p0, [x26]\n" @@ -530,7 +529,7 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "cbz x20, 36f\n" ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 36f\n" @@ -550,24 +549,24 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa "and x20, x20, #0x3\n" "cbz x21, 38f\n" "37:" // Store to output array: Accumulator row 3 loop - ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" - ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n" - "st1w { z20.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1w { z21.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" "add x12, x12, #0x4\n" - "st1w { z22.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" "cmp x12, x21, LSL #2\n" - "st1w { z23.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 37b\n" "38:" // Store to output array: Accumulator row 3 oddments "cbz x20, 39f\n" ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" "subs x20, x20, #0x1\n" - ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n" + ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 39f\n" @@ -586,10 +585,10 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, 
[x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -613,4 +612,3 @@ void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const floa } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp index b8bcd53c21..7b3cc77867 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp index 929af04032..aba677b158 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa042c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x13, x13, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa01cc299 // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n" - ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n" - ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n" - ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n" - ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n" + ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n" + ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n" + ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n" + ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n" + ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x28\n" "mov x21, x9\n" @@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - "ld1b { z10.b }, p1/Z, [x25]\n" - ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" - "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n" - ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n" - ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n" + "ld1b { z20.b }, p1/Z, [x25]\n" + ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" + "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n" + ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "ble 7f\n" "6:" // K loop - 
".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n" + ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n" - "ld1b { z10.b }, p1/Z, [x25]\n" - ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n" - ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" - ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n" - ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n" - ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n" - "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n" - ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n" - ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n" - ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n" - ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n" - "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n" - ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n" - ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n" - ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n" - ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n" - "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n" + ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n" + ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n" + "ld1b { z20.b }, p1/Z, [x25]\n" + ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n" + ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" + ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n" + ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n" + "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n" + ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n" + ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n" + ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n" + "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n" + ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n" + ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n" + ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n" + ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n" + "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n" - ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n" - ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n" - ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n" - ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n" - ".inst 0xa08f2603 // 
smopa za3.s, p1/M, p1/M, z16.b, z15.b\n" - ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n" - ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n" - ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n" - ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n" - ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n" - ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n" - ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n" - ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n" + ".inst 0xa0842680 // smopa za0.s, p1/M, p1/M, z20.b, z4.b\n" + ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa0862682 // smopa za2.s, p1/M, p1/M, z20.b, z6.b\n" + ".inst 0xa0872683 // smopa za3.s, p1/M, p1/M, z20.b, z7.b\n" + ".inst 0xa0982560 // smopa za0.s, p1/M, p1/M, z11.b, z24.b\n" + ".inst 0xa0992561 // smopa za1.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa09a2562 // smopa za2.s, p1/M, p1/M, z11.b, z26.b\n" + ".inst 0xa09b2563 // smopa za3.s, p1/M, p1/M, z11.b, z27.b\n" + ".inst 0xa09c2440 // smopa za0.s, p1/M, p1/M, z2.b, z28.b\n" + ".inst 0xa09d2441 // smopa za1.s, p1/M, p1/M, z2.b, z29.b\n" + ".inst 0xa09e2442 // smopa za2.s, p1/M, p1/M, z2.b, z30.b\n" + ".inst 0xa09f2443 // smopa za3.s, p1/M, p1/M, z2.b, z31.b\n" + ".inst 0xa09025c0 // smopa za0.s, p1/M, p1/M, z14.b, z16.b\n" + ".inst 0xa09125c1 // smopa za1.s, p1/M, p1/M, z14.b, z17.b\n" + ".inst 0xa09225c2 // smopa za2.s, p1/M, p1/M, z14.b, z18.b\n" + ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - "ld1b { z10.b }, p1/Z, [x25]\n" + "ld1b { z16.b }, p1/Z, [x25]\n" "subs x21, x21, #0x1\n" "addvl x25, x25, #1\n" - ".inst 0xa04086fc // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" + ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #4\n" - ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n" - ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n" + ".inst 0xa0842600 // smopa za0.s, p1/M, p1/M, z16.b, z4.b\n" + ".inst 0xa0852601 // smopa za1.s, p1/M, p1/M, z16.b, z5.b\n" + ".inst 0xa0862602 // smopa za2.s, p1/M, p1/M, z16.b, z6.b\n" + ".inst 0xa0872603 // smopa za3.s, p1/M, p1/M, z16.b, z7.b\n" "bgt 9b\n" "10:" // K oddments: End - "ld1w { z14.s }, p1/Z, [x25]\n" + "ld1w { z15.s }, p1/Z, [x25]\n" "addvl x25, x25, #1\n" - ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n" + ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n" "tbz x14, #1, 14f\n" "tbz x14, #0, 12f\n" "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5b8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n" - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" 
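// --- Illustrative sketch, not part of the patch: the smopa forms in this kernel
// are the widening int8 counterpart of fmopa -- each 32-bit ZA element accumulates
// a dot product of four s8 pairs -- while the addha/addva ops above fold in the
// column (bias) and row sums needed for asymmetric quantization. Scalar model,
// with hypothetical names and predicates omitted:

#include <cstddef>
#include <cstdint>

inline void smopa_ref(int32_t *za, const int8_t *a, const int8_t *b, std::size_t vl)
{
    for (std::size_t r = 0; r < vl; r++)          // vl = vector length in words
        for (std::size_t c = 0; c < vl; c++)
            for (std::size_t k = 0; k < 4; k++)   // 4 bytes per 32-bit lane
                za[r * vl + c] += int32_t(a[4 * r + k]) * int32_t(b[4 * c + k]);
}
// --- End of sketch.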
- ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" + ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" + ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n" + ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n" "addvl x13, x13, #16\n" - ".inst 0xa061c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n" - ".inst 0xa062c578 // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n" - ".inst 0xa063c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n" + ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n" + ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n" + ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n" "addvl x11, x11, #16\n" "blt 11b\n" "b 21f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" - ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa061c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" + ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n" - ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n" + ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n" + ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n" "addvl x11, x11, #16\n" "blt 13b\n" "b 21f\n" @@ -277,17 +276,17 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "ldr x24, [%x[args], %[offsetof_C]]\n" "add x24, x24, x28\n" // C += n "sub x23, x10, x9\n" - "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x22, [%x[args], %[offsetof_ldcb]]\n" "madd x24, x9, x22, x24\n" // C += m * ldc - "ld1rw { z13.s }, p1/Z, [%x[rq], 
%[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x14, #2, 15f\n" @@ -295,10 +294,10 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "add x21, x21, x28\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n" + ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n" + ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x20\n" "whilelt p0.b, x28, x27\n" @@ -311,22 +310,22 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "16:" // Store to output array: Accumulator row 0 loop ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n" ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n" - ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n" + ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n" ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n" ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n" - ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n" - ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n" + ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n" + ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n" "add x12, x12, #0x2\n" "cmp x12, x21, LSL #1\n" - ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n" - ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n" - ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n" - ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n" - ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n" - ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n" - ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n" - ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n" - ".inst 
0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n" + ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n" + ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n" + ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n" + ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n" + ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n" + ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n" + ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n" + ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n" + ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n" ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n" ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n" "uzp1 z19.b, z26.b, z28.b\n" @@ -344,29 +343,29 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" - ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n" + ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n" ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n" - ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n" - ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n" - ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n" - ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n" - ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n" - ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n" - ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n" - ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n" - ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n" - ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n" - ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n" - ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n" - ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n" - ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n" - ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n" - ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n" - "uzp1 z23.b, z2.b, z24.b\n" - ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n" + ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n" + ".inst 0xc086009a // mova { z26.s-z27.s }, za2h.s[x12, 0:1]\n" + ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n" + ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n" + ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n" + ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n" + ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n" + ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n" + ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n" + ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n" + ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n" + ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n" + ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n" + ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n" ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n" - "uzp1 z16.b, z16.b, z10.b\n" - "uzp1 z16.b, z23.b, z16.b\n" + ".inst 0xc1b4c6b8 // 
sclamp { z24.s-z25.s }, z21.s, z20.s\n" + "uzp1 z17.b, z10.b, z24.b\n" + ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n" + "uzp1 z16.b, z26.b, z30.b\n" + "uzp1 z16.b, z17.b, z16.b\n" "st1b { z16.b }, p0, [x24]\n" "18:" // Store to output array: Accumulator row 0 oddments: End "19:" // Store to output array: End @@ -374,14 +373,14 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "20:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa041c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x13, x13, #16\n" @@ -405,4 +404,3 @@ void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8 } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp index 954b0da0e1..79990f72e5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp index 0b642818e2..7033de5fe3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n" - ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n" - ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n" - ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n" - ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n" + ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n" + ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n" + ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n" + ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n" + ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -152,75 +151,75 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" - ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" - ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n" + ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" + ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" 
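// --- Illustrative sketch, not part of the patch: both s8q kernels requantize their
// s32 accumulators with the same chain seen above and below -- sqdmulh by a
// per-layer or per-channel multiplier, srshl by a (usually negative) shift, add
// c_offset, sclamp to [minval, maxval], then uzp1 to narrow back to bytes. Scalar
// model, with hypothetical names and saturation elided:

#include <algorithm>
#include <cstdint>

inline int8_t requantize_ref(int32_t acc, int32_t mul, int32_t shift,
                             int32_t c_offset, int32_t minval, int32_t maxval)
{
    int64_t v = (int64_t(acc) * mul) >> 31;               // sqdmulh: doubling mul, high half
    if (shift < 0)
        v = (v + (int64_t(1) << (-shift - 1))) >> -shift; // srshl: rounding right shift
    else
        v <<= shift;
    v += c_offset;                                        // add the output offset
    v = std::min<int64_t>(std::max<int64_t>(v, minval), maxval); // sclamp
    return int8_t(v);                                     // uzp1 narrows to int8
}
// --- End of sketch.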
"addvl x27, x27, #8\n" - ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "ble 7f\n" "6:" // K loop - ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n" + ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n" - ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n" - ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" - ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n" - ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" - ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n" - ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n" - ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n" - ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n" - ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n" - ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n" - ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n" - ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n" - ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n" - ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n" - ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n" - ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n" + ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n" + ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n" + ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n" + ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" + ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n" + ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n" + ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n" + ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n" + ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n" + ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n" + ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n" + ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n" + ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n" + ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n" + ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n" + ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "bgt 6b\n" "7:" // K loop tail - 
".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n" - ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n" - ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n" - ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n" - ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n" - ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n" - ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n" - ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n" - ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n" - ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n" - ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n" - ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n" - ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n" - ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n" - ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n" + ".inst 0xa0912460 // smopa za0.s, p1/M, p1/M, z3.b, z17.b\n" + ".inst 0xa0992461 // smopa za1.s, p1/M, p1/M, z3.b, z25.b\n" + ".inst 0xa0912562 // smopa za2.s, p1/M, p1/M, z11.b, z17.b\n" + ".inst 0xa0992563 // smopa za3.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa0962680 // smopa za0.s, p1/M, p1/M, z20.b, z22.b\n" + ".inst 0xa0972681 // smopa za1.s, p1/M, p1/M, z20.b, z23.b\n" + ".inst 0xa0962782 // smopa za2.s, p1/M, p1/M, z28.b, z22.b\n" + ".inst 0xa0972783 // smopa za3.s, p1/M, p1/M, z28.b, z23.b\n" + ".inst 0xa09026a0 // smopa za0.s, p1/M, p1/M, z21.b, z16.b\n" + ".inst 0xa09826a1 // smopa za1.s, p1/M, p1/M, z21.b, z24.b\n" + ".inst 0xa09027a2 // smopa za2.s, p1/M, p1/M, z29.b, z16.b\n" + ".inst 0xa09827a3 // smopa za3.s, p1/M, p1/M, z29.b, z24.b\n" + ".inst 0xa08724a0 // smopa za0.s, p1/M, p1/M, z5.b, z7.b\n" + ".inst 0xa08f24a1 // smopa za1.s, p1/M, p1/M, z5.b, z15.b\n" + ".inst 0xa08725a2 // smopa za2.s, p1/M, p1/M, z13.b, z7.b\n" + ".inst 0xa08f25a3 // smopa za3.s, p1/M, p1/M, z13.b, z15.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" + ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #2\n" ".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #2\n" - ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n" - ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n" - ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n" + ".inst 0xa0902660 // smopa za0.s, p1/M, p1/M, z19.b, z16.b\n" + ".inst 0xa0912661 // smopa za1.s, p1/M, p1/M, z19.b, z17.b\n" + ".inst 0xa0902762 // smopa za2.s, p1/M, p1/M, z27.b, z16.b\n" + ".inst 0xa0912763 // smopa za3.s, p1/M, p1/M, z27.b, z17.b\n" "bgt 9b\n" "10:" // K oddments: End ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n" @@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" - ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xc086045c // 
mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" + ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n" + ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 24f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" + ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 24f\n" @@ -277,13 +276,13 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "ldr x26, [%x[args], %[offsetof_C]]\n" "add x26, x26, x10\n" // C += n "sub x25, x13, x11\n" - "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x24, [%x[args], %[offsetof_ldcb]]\n" "madd x26, x11, x24, 
x26\n" // C += m * ldc - "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x16, #2, 15f\n" @@ -291,10 +290,10 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "add x21, x21, x10\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n" + ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n" + ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x23\n" "whilelt p0.h, x10, x9\n" @@ -305,26 +304,26 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "and x20, x22, #0x3\n" "cbz x21, 17f\n" "16:" // Store to output array: Accumulator row 0 loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" - ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" "add x12, x12, #0x4\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n" - ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" - ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" - ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n" - ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n" - "uzp1 z16.h, z12.h, z28.h\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" + ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n" + "uzp1 z16.h, z4.h, z8.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z13.h, z29.h\n" - "uzp1 z17.h, z14.h, z30.h\n" + "uzp1 z16.h, z5.h, z9.h\n" + "uzp1 z17.h, z6.h, z10.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z15.h, z31.h\n" + "uzp1 z16.h, z7.h, z11.h\n" "st1b { z17.h }, p0, [x26]\n" "add x26, x26, x24\n" "st1b { z16.h }, p0, [x26]\n" @@ -332,27 +331,27 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const 
int8_t *const A, const int8 "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" - ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n" + ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n" + ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" - ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" - ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n" - ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" - ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n" - "uzp1 z16.h, z28.h, z12.h\n" + ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" + ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n" + ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" + "uzp1 z16.h, z8.h, z4.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" "subs x20, x20, #0x1\n" - "uzp1 z16.h, z29.h, z13.h\n" + "uzp1 z16.h, z9.h, z5.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" - "uzp1 z16.h, z30.h, z14.h\n" + "uzp1 z16.h, z10.h, z6.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "18:" // Store to output array: Accumulator row 0 oddments: End @@ -367,25 +366,25 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 1 loop ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" - ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z1.s\n" "add x12, x12, #0x4\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" - ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" - ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n" - ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n" ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n" - "uzp1 z16.h, z4.h, z16.h\n" + ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n" + "uzp1 z16.h, z4.h, z20.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z5.h, z17.h\n" - "uzp1 z17.h, z6.h, z18.h\n" + "uzp1 z16.h, z5.h, z21.h\n" + "uzp1 z17.h, z6.h, 
z22.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z7.h, z19.h\n" + "uzp1 z16.h, z7.h, z23.h\n" "st1b { z17.h }, p0, [x26]\n" "add x26, x26, x24\n" "st1b { z16.h }, p0, [x26]\n" @@ -393,27 +392,27 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 "blt 19b\n" "20:" // Store to output array: Accumulator row 1 oddments "cbz x20, 21f\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n" - ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n" - ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" - ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n" - ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" - ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n" - "uzp1 z16.h, z20.h, z16.h\n" + "uzp1 z16.h, z4.h, z16.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" "subs x20, x20, #0x1\n" - "uzp1 z16.h, z21.h, z17.h\n" + "uzp1 z16.h, z5.h, z17.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" - "uzp1 z16.h, z22.h, z18.h\n" + "uzp1 z16.h, z6.h, z18.h\n" "st1b { z16.h }, p0, [x26]\n" "21:" // Store to output array: Accumulator row 1 oddments: End "22:" // Store to output array: End @@ -452,4 +451,3 @@ void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8 } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp index 420c219af5..ef39cbbb28 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp index 0d0e3da224..4601f05501 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" - ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n" - ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n" + "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n" + ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b,
z30.b }, pn8.b/Z, [x27]\n" - "ldnt1b { z0.b }, p1/Z, [x23]\n" - ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n" + "ldnt1b { z14.b }, p1/Z, [x23]\n" + ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "ble 7f\n" "6:" // K loop - ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n" + ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n" - ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" - ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n" - "ldnt1b { z0.b }, p1/Z, [x23]\n" - ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n" - ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n" - ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n" - ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n" - "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n" - ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n" - ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n" - ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n" - ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n" - ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n" - ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n" + ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n" + ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n" + ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n" + ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n" + "ldnt1b { z14.b }, p1/Z, [x23]\n" + ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n" + ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n" + ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n" + ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n" + "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n" + ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n" + ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n" + ".inst 0xa0428378 // 
ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n" + ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n" + ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n" + ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n" + ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n" - ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n" - ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n" - ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n" - ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n" - ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n" - ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n" - ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n" - ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n" - ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n" - ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n" - ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n" - ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n" + ".inst 0xa08e2480 // smopa za0.s, p1/M, p1/M, z4.b, z14.b\n" + ".inst 0xa08e24a1 // smopa za1.s, p1/M, p1/M, z5.b, z14.b\n" + ".inst 0xa08e24c2 // smopa za2.s, p1/M, p1/M, z6.b, z14.b\n" + ".inst 0xa08e24e3 // smopa za3.s, p1/M, p1/M, z7.b, z14.b\n" + ".inst 0xa09f2680 // smopa za0.s, p1/M, p1/M, z20.b, z31.b\n" + ".inst 0xa09f26a1 // smopa za1.s, p1/M, p1/M, z21.b, z31.b\n" + ".inst 0xa09f26c2 // smopa za2.s, p1/M, p1/M, z22.b, z31.b\n" + ".inst 0xa09f26e3 // smopa za3.s, p1/M, p1/M, z23.b, z31.b\n" + ".inst 0xa08d2700 // smopa za0.s, p1/M, p1/M, z24.b, z13.b\n" + ".inst 0xa08d2721 // smopa za1.s, p1/M, p1/M, z25.b, z13.b\n" + ".inst 0xa08d2742 // smopa za2.s, p1/M, p1/M, z26.b, z13.b\n" + ".inst 0xa08d2763 // smopa za3.s, p1/M, p1/M, z27.b, z13.b\n" + ".inst 0xa09d2500 // smopa za0.s, p1/M, p1/M, z8.b, z29.b\n" + ".inst 0xa09d2521 // smopa za1.s, p1/M, p1/M, z9.b, z29.b\n" + ".inst 0xa09d2542 // smopa za2.s, p1/M, p1/M, z10.b, z29.b\n" + ".inst 0xa09d2563 // smopa za3.s, p1/M, p1/M, z11.b, z29.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #4\n" - "ld1b { z0.b }, p1/Z, [x23]\n" + "ld1b { z15.b }, p1/Z, [x23]\n" "addvl x23, x23, #1\n" - ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n" - ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n" + ".inst 0xa08f2640 // smopa za0.s, p1/M, p1/M, z18.b, z15.b\n" + ".inst 0xa08f26c1 // smopa za1.s, p1/M, p1/M, z22.b, z15.b\n" + ".inst 0xa08f2742 // smopa za2.s, p1/M, p1/M, z26.b, z15.b\n" + ".inst 0xa08f27c3 // smopa za3.s, p1/M, p1/M, z30.b, z15.b\n" "bgt 9b\n" "10:" // K oddments: End - ".inst 0xa040c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n" + ".inst 0xa140c363 // ld1w { z3.s, 
z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n" "addvl x27, x27, #4\n" - ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n" - ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n" - ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n" - ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n" + ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n" + ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n" + ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n" + ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n" "tbz x16, #1, 14f\n" "tbz x16, #0, 12f\n" "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xa041c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n" - ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n" + ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 30f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" + ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, 
x20\n" - ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 30f\n" @@ -277,22 +276,22 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "ldr x26, [%x[args], %[offsetof_C]]\n" "add x26, x26, x10\n" // C += n "sub x25, x13, x11\n" - "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x24, [%x[args], %[offsetof_ldcb]]\n" "madd x26, x11, x24, x26\n" // C += m * ldc - "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" - "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" - "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" + "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" + "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x16, #2, 15f\n" "ldr w21, [%x[args], %[offsetof_n_0]]\n" "add x21, x21, x10\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - "ld1w { z8.s }, p0/Z, [x20]\n" + "ld1w { z2.s }, p0/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - "ld1w { z7.s }, p0/Z, [x20]\n" + "ld1w { z1.s }, p0/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x23\n" "whilelt p0.s, x10, x9\n" @@ -303,30 +302,30 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "and x20, x22, #0x3\n" "cbz x21, 17f\n" "16:" // Store to output array: Accumulator row 0 loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n" - "st1b { z12.s }, p0, [x26]\n" + ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n" + ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n" + "st1b { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z13.s }, p0, [x26]\n" + "st1b { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z14.s }, p0, [x26]\n" + "st1b { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z15.s }, p0, [x26]\n" + "st1b { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n" + ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" - ".inst 
0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" - ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n" + ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" + ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n" + ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1b { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" @@ -347,38 +346,38 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "and x20, x22, #0x3\n" "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" - ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n" - "st1b { z16.s }, p0, [x26]\n" + ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1b { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z17.s }, p0, [x26]\n" + "st1b { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z18.s }, p0, [x26]\n" + "st1b { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z19.s }, p0, [x26]\n" + "st1b { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 19b\n" "20:" // Store to output array: Accumulator row 1 oddments "cbz x20, 21f\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n" - "st1b { z28.s }, p0, [x26]\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" + ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1b { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" "subs x20, x20, #0x1\n" - "st1b { z29.s }, p0, [x26]\n" + "st1b { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" - "st1b { z30.s }, p0, [x26]\n" + "st1b { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "21:" // Store to output array: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -391,30 +390,30 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "and x20, x22, #0x3\n" "cbz x21, 23f\n" "22:" // Store to output array: Accumulator row 2 loop - ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" - ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n" + ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" + ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n" + ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" - ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, 
z4.s\n" - "st1b { z24.s }, p0, [x26]\n" + ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n" + ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n" + "st1b { z8.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z25.s }, p0, [x26]\n" + "st1b { z9.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z26.s }, p0, [x26]\n" + "st1b { z10.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z27.s }, p0, [x26]\n" + "st1b { z11.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 22b\n" "23:" // Store to output array: Accumulator row 2 oddments "cbz x20, 24f\n" ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n" + ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n" + ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n" + ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n" "st1b { z12.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" @@ -435,52 +434,52 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 "and x20, x20, #0x3\n" "cbz x21, 26f\n" "25:" // Store to output array: Accumulator row 3 loop - ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" - ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n" + ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n" - ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n" - "st1b { z20.s }, p0, [x26]\n" + ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" + ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1b { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z21.s }, p0, [x26]\n" + "st1b { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z22.s }, p0, [x26]\n" + "st1b { z30.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z23.s }, p0, [x26]\n" + "st1b { z31.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 25b\n" "26:" // Store to output array: Accumulator row 3 oddments "cbz x20, 27f\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n" - "st1b { z0.s }, p0, [x26]\n" + ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" + ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" + ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1b { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 27f\n" "subs x20, x20, #0x1\n" - "st1b { z1.s }, p0, [x26]\n" + "st1b { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 27f\n" - "st1b { z2.s }, p0, [x26]\n" + "st1b { z30.s 
}, p0, [x26]\n" "27:" // Store to output array: Accumulator row 3 oddments: End "28:" // Store to output array: End "tbz x16, #0, 30f\n" "mov x12, #0x0\n" "cntw x20\n" "29:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -504,4 +503,3 @@ void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8 } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp index c969c7aaff..b9d8b60c8d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -32,7 +32,7 @@ namespace arm_gemm { // Implementations -void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); +void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL { @@ -40,7 +40,7 @@ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); + typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); /* Kernel blocking parameters */ static unsigned int out_height() @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git
a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp index 12e714a471..d11faa634d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -32,10 +31,8 @@ namespace arm_gemm { -void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer) +void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer) { - ARM_COMPUTE_UNUSED(act); - struct KernelArgs { KernelArgs( @@ -96,12 +93,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "1:" // Initial accumulator load from buffer: Loop ".inst 0xa040c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11]\n" ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n" - ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n" - ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" - ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n" + ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n" + ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x11, x11, #16\n" @@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa11bc28a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, p8/Z, [x20, x27, LSL #2]\n" - ".inst 0xc0900040 // addha za0.s, p0/M, p0/M, z2.s\n" - ".inst 0xc09000c1 // addha za1.s, p0/M, p0/M, z6.s\n" - ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n" - ".inst 0xc09001c3 // addha za3.s, p0/M, p0/M, z14.s\n" + ".inst 0xa11bc29b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, p8/Z, [x20, x27, LSL #2]\n" + ".inst 0xc0900260 // addha za0.s, p0/M, p0/M, z19.s\n" + ".inst 0xc09002e1 // addha za1.s, p0/M, p0/M, z23.s\n" + ".inst 0xc0900362 // addha za2.s, p0/M, p0/M, z27.s\n" + ".inst 0xc09003e3 // addha za3.s, p0/M, p0/M, z31.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x27\n" "mov x21, x28\n" @@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "madd x23, x27, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, 
#0x1\n" - "ld1b { z20.b }, p0/Z, [x24]\n" - ".inst 0xa14086e9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n" - "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n" - ".inst 0xa14186fa // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n" - ".inst 0xa14286eb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n" + "ld1b { z30.b }, p0/Z, [x24]\n" + ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n" + "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + "ld1b { z28.b }, p0/Z, [x24, #2, MUL VL]\n" + ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n" "addvl x24, x24, #4\n" - ".inst 0xa14386e8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "ble 7f\n" "6:" // K loop - ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n" + ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n" - ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n" - ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n" - "ld1b { z20.b }, p0/Z, [x24]\n" - ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n" - ".inst 0xa14086e9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n" - ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n" - ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n" - ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n" - "ld1b { z10.b }, p0/Z, [x24, #1, MUL VL]\n" - ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n" - ".inst 0xa14186fa // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n" - ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n" - ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n" - "ld1b { z16.b }, p0/Z, [x24, #2, MUL VL]\n" - ".inst 0xa14286eb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n" - ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n" - ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n" - ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n" - "ld1b { z25.b }, p0/Z, [x24, #3, MUL VL]\n" + ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n" + ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n" + ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n" + "ld1b { z30.b }, p0/Z, [x24]\n" + ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n" + ".inst 0xa04086e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23]\n" + ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n" + ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n" + ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n" + "ld1b { z21.b }, p0/Z, [x24, #1, MUL VL]\n" + ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n" + ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n" + ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n" + "ld1b { z28.b }, 
p0/Z, [x24, #2, MUL VL]\n" + ".inst 0xa04286e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n" + ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n" + ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n" + ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n" + "ld1b { z11.b }, p0/Z, [x24, #3, MUL VL]\n" "addvl x24, x24, #4\n" - ".inst 0xa14386e8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n" - ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n" - ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n" - ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n" - ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n" - ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n" - ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n" - ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n" - ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n" - ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n" - ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n" - ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n" - ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n" - ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n" - ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n" - ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n" + ".inst 0xa08003c0 // smopa za0.s, p0/M, p0/M, z30.b, z0.b\n" + ".inst 0xa08103c1 // smopa za1.s, p0/M, p0/M, z30.b, z1.b\n" + ".inst 0xa08203c2 // smopa za2.s, p0/M, p0/M, z30.b, z2.b\n" + ".inst 0xa08303c3 // smopa za3.s, p0/M, p0/M, z30.b, z3.b\n" + ".inst 0xa09802a0 // smopa za0.s, p0/M, p0/M, z21.b, z24.b\n" + ".inst 0xa09902a1 // smopa za1.s, p0/M, p0/M, z21.b, z25.b\n" + ".inst 0xa09a02a2 // smopa za2.s, p0/M, p0/M, z21.b, z26.b\n" + ".inst 0xa09b02a3 // smopa za3.s, p0/M, p0/M, z21.b, z27.b\n" + ".inst 0xa0840380 // smopa za0.s, p0/M, p0/M, z28.b, z4.b\n" + ".inst 0xa0850381 // smopa za1.s, p0/M, p0/M, z28.b, z5.b\n" + ".inst 0xa0860382 // smopa za2.s, p0/M, p0/M, z28.b, z6.b\n" + ".inst 0xa0870383 // smopa za3.s, p0/M, p0/M, z28.b, z7.b\n" + ".inst 0xa0900160 // smopa za0.s, p0/M, p0/M, z11.b, z16.b\n" + ".inst 0xa0910161 // smopa za1.s, p0/M, p0/M, z11.b, z17.b\n" + ".inst 0xa0920162 // smopa za2.s, p0/M, p0/M, z11.b, z18.b\n" + ".inst 0xa0930163 // smopa za3.s, p0/M, p0/M, z11.b, z19.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - "ld1b { z20.b }, p0/Z, [x24]\n" + "ld1b { z22.b }, p0/Z, [x24]\n" "subs x21, x21, #0x1\n" "addvl x24, x24, #1\n" - ".inst 0xa14086e1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x23]\n" + ".inst 0xa14086f1 // ld1b { z17.b, z21.b, z25.b, z29.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #4\n" - ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n" - ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n" - ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n" - ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n" + ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n" + ".inst 0xa09502c1 // smopa za1.s, p0/M, p0/M, z22.b, z21.b\n" + ".inst 0xa09902c2 // smopa za2.s, p0/M, p0/M, z22.b, z25.b\n" + ".inst 0xa09d02c3 // smopa za3.s, p0/M, p0/M, z22.b, z29.b\n" "bgt 
9b\n" "10:" // K oddments: End "tbz x13, #1, 14f\n" @@ -222,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11]\n" + ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n" ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" - ".inst 0xa041c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n" - ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" - ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" - ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" - ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n" - ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" + ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + ".inst 0xa042c560 // ld1w { z0.s-z3.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n" + ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa043c574 // ld1w { z20.s-z23.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" ".inst 0xa060c544 // st1w { z4.s-z7.s }, pn9.b, [x10]\n" "addvl x11, x11, #16\n" - ".inst 0xa061c554 // st1w { z20.s-z23.s }, pn9.b, [x10, #0x4, MUL VL]\n" - ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n" - ".inst 0xa063c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0xc, MUL VL]\n" + ".inst 0xa061c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0x4, MUL VL]\n" + ".inst 0xa062c55c // st1w { z28.s-z31.s }, pn9.b, [x10, #0x8, MUL VL]\n" + ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n" "addvl x10, x10, #16\n" "blt 11b\n" "b 20f\n" @@ -248,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n" - ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" - ".inst 0xa060c554 // st1w { z20.s-z23.s }, pn9.b, [x10]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n" + ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa060c54c // st1w { z12.s-z15.s }, pn9.b, [x10]\n" + ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n" + ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" + ".inst 0xa061c544 // st1w { z4.s-z7.s }, pn9.b, [x10, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n" - ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n" + ".inst 0xa062c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x8, MUL VL]\n" + ".inst 
0xa063c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0xc, MUL VL]\n" "addvl x10, x10, #16\n" "blt 13b\n" "b 20f\n" @@ -293,32 +290,32 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in "16:" // Store to output array: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa160c2e0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x23]\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa160c2f0 // st1w { z16.s, z20.s, z24.s, z28.s }, p8, [x23]\n" "add x23, x23, x22\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa160c2e1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x23]\n" + ".inst 0xa160c2f1 // st1w { z17.s, z21.s, z25.s, z29.s }, p8, [x23]\n" "add x23, x23, x22\n" "beq 17f\n" - ".inst 0xa160c2e2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x23]\n" + ".inst 0xa160c2f2 // st1w { z18.s, z22.s, z26.s, z30.s }, p8, [x23]\n" "17:" // Store to output array: Accumulator row 0 oddments: End "18:" // Store to output array: End "tbz x13, #0, 20f\n" "mov x12, #0x0\n" "cntw x20\n" "19:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11]\n" + ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa043c568 // ld1w { z8.s-z11.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x11, x11, #16\n" @@ -342,4 +339,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const in } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp index a0705e50cd..f05d2cf215 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -32,7 +32,7 @@ namespace arm_gemm { // Implementations -void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); +void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL { @@ -40,7 +40,7 @@ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); + typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); /* Kernel blocking parameters */ static unsigned int out_height() @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp index d7a7528211..47de894306 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -32,10 +31,8 @@ namespace arm_gemm { -void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer) +void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer) { - ARM_COMPUTE_UNUSED(act); - struct KernelArgs { KernelArgs( @@ -96,12 +93,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "1:" // Initial accumulator load from buffer: Loop ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n" ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" - ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa042c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa10a429c // ldnt1w { z20.s, z28.s }, p8/Z, [x20, x10, LSL #2]\n" + ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n" ".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n" - ".inst 0xc0900381 // addha za1.s, p0/M, p0/M, z28.s\n" + ".inst 0xc09002a1 // addha za1.s, p0/M, p0/M, z21.s\n" ".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n" - ".inst 0xc0900383 // addha za3.s, p0/M, p0/M, z28.s\n" + ".inst 0xc09002a3 // addha za3.s, p0/M, p0/M, z21.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n" - ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" - ".inst 0xa1410770 // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa14106eb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa0420768 // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04206f3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n" + ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n" + ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa14106ff // ldnt1b { z23.b, 
z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14306fd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "ble 7f\n" "6:" // K loop - ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n" + ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n" - ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n" - ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n" - ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n" - ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n" - ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" - ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n" - ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n" - ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n" - ".inst 0xa1410770 // ld1b { z16.b, z24.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n" - ".inst 0xa14106eb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n" - ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n" - ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n" - ".inst 0xa0420768 // ld1b { z8.b-z9.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa04206f3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n" - ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n" - ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n" - ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n" + ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n" + ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n" + ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n" + ".inst 0xa040077c // ld1b { z28.b-z29.b }, pn9.b/Z, [x27]\n" + ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n" + ".inst 0xa14006e8 // ldnt1b { z0.b, z8.b }, pn9.b/Z, [x23]\n" + ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n" + ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n" + ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n" + ".inst 0xa0410762 // ld1b { z2.b-z3.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n" + ".inst 0xa14106ff // ldnt1b { z23.b, z31.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n" + ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n" + ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n" + ".inst 0xa042076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n" + ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n" + ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n" + ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n" ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14306fd // 
ldnt1b { z21.b, z29.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa04306f5 // ldnt1b { z20.b-z21.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n" - ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n" - ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n" - ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n" - ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n" - ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n" - ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n" - ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n" - ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n" - ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n" - ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n" - ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n" - ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n" - ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n" - ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n" - ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n" + ".inst 0xa0800380 // smopa za0.s, p0/M, p0/M, z28.b, z0.b\n" + ".inst 0xa0880381 // smopa za1.s, p0/M, p0/M, z28.b, z8.b\n" + ".inst 0xa08003a2 // smopa za2.s, p0/M, p0/M, z29.b, z0.b\n" + ".inst 0xa08803a3 // smopa za3.s, p0/M, p0/M, z29.b, z8.b\n" + ".inst 0xa0970040 // smopa za0.s, p0/M, p0/M, z2.b, z23.b\n" + ".inst 0xa09f0041 // smopa za1.s, p0/M, p0/M, z2.b, z31.b\n" + ".inst 0xa0970062 // smopa za2.s, p0/M, p0/M, z3.b, z23.b\n" + ".inst 0xa09f0063 // smopa za3.s, p0/M, p0/M, z3.b, z31.b\n" + ".inst 0xa09001c0 // smopa za0.s, p0/M, p0/M, z14.b, z16.b\n" + ".inst 0xa09801c1 // smopa za1.s, p0/M, p0/M, z14.b, z24.b\n" + ".inst 0xa09001e2 // smopa za2.s, p0/M, p0/M, z15.b, z16.b\n" + ".inst 0xa09801e3 // smopa za3.s, p0/M, p0/M, z15.b, z24.b\n" + ".inst 0xa0940080 // smopa za0.s, p0/M, p0/M, z4.b, z20.b\n" + ".inst 0xa0950081 // smopa za1.s, p0/M, p0/M, z4.b, z21.b\n" + ".inst 0xa09400a2 // smopa za2.s, p0/M, p0/M, z5.b, z20.b\n" + ".inst 0xa09500a3 // smopa za3.s, p0/M, p0/M, z5.b, z21.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa1400776 // ld1b { z22.b, z30.b }, pn9.b/Z, [x27]\n" + ".inst 0xa1400774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #2\n" - ".inst 0xa14006f1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" + ".inst 0xa14006e7 // ld1b { z7.b, z15.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #2\n" - ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n" - ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n" - ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n" - ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n" + ".inst 0xa0870280 // smopa za0.s, p0/M, p0/M, z20.b, z7.b\n" + ".inst 0xa08f0281 // smopa za1.s, p0/M, p0/M, z20.b, z15.b\n" + ".inst 0xa0870382 // smopa za2.s, p0/M, p0/M, z28.b, z7.b\n" + ".inst 0xa08f0383 // smopa za3.s, p0/M, p0/M, z28.b, z15.b\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -223,24 +220,24 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" + ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" ".inst 0xc0840580 // mova 
za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" - ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" + ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5d8 // st1w { z24.s-z27.s }, pn9.b, [x14]\n" + ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 23f\n" @@ -248,16 +245,16 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n" - ".inst 0xa061c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa060c5cc // st1w { z12.s-z15.s }, pn9.b, [x14]\n" + ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" + ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 23f\n" @@ -275,32 +272,32 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "and x20, x22, #0x3\n" "cbz x21, 16f\n" "15:" // Store to output array: Accumulator row 0 loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n" - ".inst 0xa1604350 // st1w { z16.s, z24.s }, p8, [x26]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" + 
".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" "add x26, x26, x23\n" - ".inst 0xa1604351 // st1w { z17.s, z25.s }, p8, [x26]\n" + ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" "add x26, x26, x23\n" "add x12, x12, #0x4\n" - ".inst 0xa1604352 // st1w { z18.s, z26.s }, p8, [x26]\n" + ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" "add x26, x26, x23\n" "cmp x12, x21, LSL #2\n" - ".inst 0xa1604353 // st1w { z19.s, z27.s }, p8, [x26]\n" + ".inst 0xa1604347 // st1w { z7.s, z15.s }, p8, [x26]\n" "add x26, x26, x23\n" "blt 15b\n" "16:" // Store to output array: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" + ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" "add x26, x26, x23\n" "beq 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n" + ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" "add x26, x26, x23\n" "beq 17f\n" - ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n" + ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" "add x26, x26, x23\n" "17:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -328,30 +325,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in "19:" // Store to output array: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa1604344 // st1w { z4.s, z12.s }, p8, [x26]\n" + ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n" + ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" + ".inst 0xa1604340 // st1w { z0.s, z8.s }, p8, [x26]\n" "add x26, x26, x23\n" "beq 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xa1604345 // st1w { z5.s, z13.s }, p8, [x26]\n" + ".inst 0xa1604341 // st1w { z1.s, z9.s }, p8, [x26]\n" "add x26, x26, x23\n" "beq 20f\n" - ".inst 0xa1604346 // st1w { z6.s, z14.s }, p8, [x26]\n" + ".inst 0xa1604342 // st1w { z2.s, z10.s }, p8, [x26]\n" "20:" // Store to output array: Accumulator row 1 oddments: End "21:" // Store to output array: End "tbz x16, #0, 23f\n" "mov x12, #0x0\n" "cntw x20\n" "22:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa041c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa041c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa042c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c5e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840483 // mova 
za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -375,4 +372,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const in } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp index be1106da13..ce10ab30e7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -32,7 +32,7 @@ namespace arm_gemm { // Implementations -void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); +void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL { @@ -40,7 +40,7 @@ public: typedef int8_t operand_type; typedef int32_t result_type; - typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); + typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer); /* Kernel blocking parameters */ static unsigned int out_height() @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp index d863b6c72a..a23c44b7da 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -32,10 +31,8 @@ namespace arm_gemm { -void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer) +void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation, bool accumulate, int32_t *const accumulator_buffer) { - ARM_COMPUTE_UNUSED(act); - struct KernelArgs { KernelArgs( @@ -94,14 +91,14 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa041c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -119,11 +116,11 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n" - ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n" + "ldnt1w { z17.s }, p0/Z, [x20, x10, LSL #2]\n" + ".inst 0xc0902620 // addha za0.s, p1/M, p1/M, z17.s\n" + ".inst 0xc0902621 // addha za1.s, p1/M, p1/M, z17.s\n" + ".inst 0xc0902622 // addha za2.s, p1/M, p1/M, z17.s\n" + ".inst 0xc0902623 // addha za3.s, p1/M, p1/M, z17.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -146,75 +143,75 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n" - "ldnt1b { z7.b }, p1/Z, [x23]\n" - ".inst 0xa041837c // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa0428360 // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa1408372 // ld1b { 
z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" + "ldnt1b { z12.b }, p1/Z, [x23]\n" + ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "ble 7f\n" "6:" // K loop - ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n" + ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n" - ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n" - ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n" - ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n" - ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n" - "ldnt1b { z7.b }, p1/Z, [x23]\n" - ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n" - ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n" - ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n" - ".inst 0xa041837c // ld1b { z28.b-z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n" - "ldnt1b { z13.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n" - ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n" - ".inst 0xa0428360 // ld1b { z0.b-z3.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z12.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n" - ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n" - ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n" - ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n" - ".inst 0xa0438378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n" + ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n" + ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n" + ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" + ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n" + "ldnt1b { z12.b }, p1/Z, [x23]\n" + ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n" + ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n" + ".inst 0xa1418370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n" + "ldnt1b { z5.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n" + ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n" + ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n" + ".inst 0xa1428363 // ld1b { z3.b, z7.b, z11.b, z15.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z4.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n" + ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n" + ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n" + ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n" + ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, 
[x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z23.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z19.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n" - ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n" - ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n" - ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n" - ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n" - ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n" - ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n" - ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n" - ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n" - ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n" - ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n" - ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n" - ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n" - ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n" - ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n" + ".inst 0xa08c2640 // smopa za0.s, p1/M, p1/M, z18.b, z12.b\n" + ".inst 0xa08c26c1 // smopa za1.s, p1/M, p1/M, z22.b, z12.b\n" + ".inst 0xa08c2742 // smopa za2.s, p1/M, p1/M, z26.b, z12.b\n" + ".inst 0xa08c27c3 // smopa za3.s, p1/M, p1/M, z30.b, z12.b\n" + ".inst 0xa0852600 // smopa za0.s, p1/M, p1/M, z16.b, z5.b\n" + ".inst 0xa0852681 // smopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa0852702 // smopa za2.s, p1/M, p1/M, z24.b, z5.b\n" + ".inst 0xa0852783 // smopa za3.s, p1/M, p1/M, z28.b, z5.b\n" + ".inst 0xa0842460 // smopa za0.s, p1/M, p1/M, z3.b, z4.b\n" + ".inst 0xa08424e1 // smopa za1.s, p1/M, p1/M, z7.b, z4.b\n" + ".inst 0xa0842562 // smopa za2.s, p1/M, p1/M, z11.b, z4.b\n" + ".inst 0xa08425e3 // smopa za3.s, p1/M, p1/M, z15.b, z4.b\n" + ".inst 0xa0932440 // smopa za0.s, p1/M, p1/M, z2.b, z19.b\n" + ".inst 0xa09324c1 // smopa za1.s, p1/M, p1/M, z6.b, z19.b\n" + ".inst 0xa0932542 // smopa za2.s, p1/M, p1/M, z10.b, z19.b\n" + ".inst 0xa09325c3 // smopa za3.s, p1/M, p1/M, z14.b, z19.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa0408370 // ld1b { z16.b-z19.b }, pn8.b/Z, [x27]\n" + ".inst 0xa0408368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #4\n" - "ld1b { z7.b }, p1/Z, [x23]\n" + "ld1b { z15.b }, p1/Z, [x23]\n" "addvl x23, x23, #1\n" - ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n" - ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n" - ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n" - ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n" + ".inst 0xa08f2500 // smopa za0.s, p1/M, p1/M, z8.b, z15.b\n" + ".inst 0xa08f2521 // smopa za1.s, p1/M, p1/M, z9.b, z15.b\n" + ".inst 0xa08f2542 // smopa za2.s, p1/M, p1/M, z10.b, z15.b\n" + ".inst 0xa08f2563 // smopa za3.s, p1/M, p1/M, z11.b, z15.b\n" "bgt 9b\n" "10:" // K oddments: End "tbz x16, #1, 14f\n" @@ -222,25 +219,25 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" - ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - 
".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" + ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" + ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" + ".inst 0xa060c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 29f\n" @@ -248,12 +245,12 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xa060c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xa061c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n" @@ -275,30 +272,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "and x20, x22, #0x3\n" "cbz x21, 16f\n" "15:" // Store to output array: Accumulator row 0 loop - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - "st1w { z28.s }, p0, [x26]\n" + ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + "st1w { z8.s }, p0, [x26]\n" "add x26, x26, x23\n" - "st1w { z29.s }, p0, [x26]\n" + "st1w { z9.s }, p0, [x26]\n" "add x26, x26, x23\n" "add x12, x12, #0x4\n" - "st1w { z30.s }, p0, [x26]\n" + "st1w { z10.s }, p0, [x26]\n" "add x26, x26, x23\n" "cmp x12, x21, LSL #2\n" - "st1w { z31.s }, p0, [x26]\n" + "st1w { 
z11.s }, p0, [x26]\n" "add x26, x26, x23\n" "blt 15b\n" "16:" // Store to output array: Accumulator row 0 oddments "cbz x20, 17f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" - "st1w { z8.s }, p0, [x26]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + "st1w { z4.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 17f\n" "subs x20, x20, #0x1\n" - "st1w { z9.s }, p0, [x26]\n" + "st1w { z5.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 17f\n" - "st1w { z10.s }, p0, [x26]\n" + "st1w { z6.s }, p0, [x26]\n" "add x26, x26, x23\n" "17:" // Store to output array: Accumulator row 0 oddments: End "subs x25, x25, x22\n" @@ -310,30 +307,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "and x20, x22, #0x3\n" "cbz x21, 19f\n" "18:" // Store to output array: Accumulator row 1 loop - ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" - "st1w { z0.s }, p0, [x26]\n" + ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x23\n" - "st1w { z1.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x23\n" "add x12, x12, #0x4\n" - "st1w { z2.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x23\n" "cmp x12, x21, LSL #2\n" - "st1w { z3.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x23\n" "blt 18b\n" "19:" // Store to output array: Accumulator row 1 oddments "cbz x20, 20f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + "st1w { z20.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 20f\n" "subs x20, x20, #0x1\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z21.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 20f\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z22.s }, p0, [x26]\n" "add x26, x26, x23\n" "20:" // Store to output array: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -345,30 +342,30 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "and x20, x22, #0x3\n" "cbz x21, 22f\n" "21:" // Store to output array: Accumulator row 2 loop - ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" + "st1w { z24.s }, p0, [x26]\n" "add x26, x26, x23\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z25.s }, p0, [x26]\n" "add x26, x26, x23\n" "add x12, x12, #0x4\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z26.s }, p0, [x26]\n" "add x26, x26, x23\n" "cmp x12, x21, LSL #2\n" - "st1w { z19.s }, p0, [x26]\n" + "st1w { z27.s }, p0, [x26]\n" "add x26, x26, x23\n" "blt 21b\n" "22:" // Store to output array: Accumulator row 2 oddments "cbz x20, 23f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n" - "st1w { z0.s }, p0, [x26]\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 23f\n" "subs x20, x20, #0x1\n" - "st1w { z1.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 23f\n" - "st1w { z2.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x23\n" "23:" // Store to output array: Accumulator row 2 oddments: End "subs x25, x25, x22\n" @@ -380,44 +377,44 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in "and x20, x20, #0x3\n" "cbz x21, 25f\n" "24:" // Store to output array: Accumulator row 3 loop - ".inst 0xc086046c // mova { 
z12.s-z15.s }, za3h.s[x12]\n" - "st1w { z12.s }, p0, [x26]\n" + ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" + "st1w { z16.s }, p0, [x26]\n" "add x26, x26, x23\n" - "st1w { z13.s }, p0, [x26]\n" + "st1w { z17.s }, p0, [x26]\n" "add x26, x26, x23\n" "add x12, x12, #0x4\n" - "st1w { z14.s }, p0, [x26]\n" + "st1w { z18.s }, p0, [x26]\n" "add x26, x26, x23\n" "cmp x12, x21, LSL #2\n" - "st1w { z15.s }, p0, [x26]\n" + "st1w { z19.s }, p0, [x26]\n" "add x26, x26, x23\n" "blt 24b\n" "25:" // Store to output array: Accumulator row 3 oddments "cbz x20, 26f\n" "subs x20, x20, #0x1\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - "st1w { z16.s }, p0, [x26]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + "st1w { z12.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 26f\n" "subs x20, x20, #0x1\n" - "st1w { z17.s }, p0, [x26]\n" + "st1w { z13.s }, p0, [x26]\n" "add x26, x26, x23\n" "beq 26f\n" - "st1w { z18.s }, p0, [x26]\n" + "st1w { z14.s }, p0, [x26]\n" "26:" // Store to output array: Accumulator row 3 oddments: End "27:" // Store to output array: End "tbz x16, #0, 29f\n" "mov x12, #0x0\n" "cntw x20\n" "28:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -441,4 +438,3 @@ void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const in } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp index c7bd38d905..fb84883913 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp index d868ed2b67..96247d2db5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n" - ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa042c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa041c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840501 // mova za1h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa042c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x13, x13, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa01cc299 // ldnt1w { z24.s-z27.s }, p8/Z, [x20, x28, LSL #2]\n" - ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n" - ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n" - ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n" - ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n" + ".inst 0xa11cc289 // ldnt1w { z1.s, z5.s, z9.s, z13.s }, p8/Z, [x20, x28, LSL #2]\n" + ".inst 0xc0902420 // addha za0.s, p1/M, p1/M, z1.s\n" + ".inst 0xc09024a1 // addha za1.s, p1/M, p1/M, z5.s\n" + ".inst 0xc0902522 // addha za2.s, p1/M, p1/M, z9.s\n" + ".inst 0xc09025a3 // addha za3.s, p1/M, p1/M, z13.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x28\n" "mov x21, x9\n" @@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "madd x23, x28, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" -
"ld1b { z10.b }, p1/Z, [x25]\n" - ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" - "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n" - ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n" - ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n" + "ld1b { z20.b }, p1/Z, [x25]\n" + ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" + "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n" + ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" + "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "ble 7f\n" "6:" // K loop - ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n" + ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n" - "ld1b { z10.b }, p1/Z, [x25]\n" - ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n" - ".inst 0xa04086fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" - ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n" - ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n" - ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n" - "ld1b { z16.b }, p1/Z, [x25, #1, MUL VL]\n" - ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n" - ".inst 0xa04186ed // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n" - ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n" - ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n" - "ld1b { z21.b }, p1/Z, [x25, #2, MUL VL]\n" - ".inst 0xa04286f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x8, MUL VL]\n" - ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n" - ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n" - ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n" - ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n" - "ld1b { z19.b }, p1/Z, [x25, #3, MUL VL]\n" + ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n" + ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n" + "ld1b { z20.b }, p1/Z, [x25]\n" + ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n" + ".inst 0xa04086e5 // ldnt1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" + ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n" + ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n" + "ld1b { z11.b }, p1/Z, [x25, #1, MUL VL]\n" + ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n" + ".inst 0xa04186f9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n" + ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n" + ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n" + "ld1b { z2.b }, p1/Z, [x25, #2, MUL VL]\n" + ".inst 0xa04286fd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x23, #0x8, MUL 
VL]\n" + ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n" + ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n" + ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n" + ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n" + "ld1b { z14.b }, p1/Z, [x25, #3, MUL VL]\n" "addvl x25, x25, #4\n" - ".inst 0xa04386e1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" + ".inst 0xa04386f1 // ldnt1b { z16.b-z19.b }, pn9.b/Z, [x23, #0xc, MUL VL]\n" "addvl x23, x23, #16\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n" - ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n" - ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n" - ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n" - ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n" - ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n" - ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n" - ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n" - ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n" - ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n" - ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n" - ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n" - ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n" - ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n" + ".inst 0xa1a42680 // umopa za0.s, p1/M, p1/M, z20.b, z4.b\n" + ".inst 0xa1a52681 // umopa za1.s, p1/M, p1/M, z20.b, z5.b\n" + ".inst 0xa1a62682 // umopa za2.s, p1/M, p1/M, z20.b, z6.b\n" + ".inst 0xa1a72683 // umopa za3.s, p1/M, p1/M, z20.b, z7.b\n" + ".inst 0xa1b82560 // umopa za0.s, p1/M, p1/M, z11.b, z24.b\n" + ".inst 0xa1b92561 // umopa za1.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa1ba2562 // umopa za2.s, p1/M, p1/M, z11.b, z26.b\n" + ".inst 0xa1bb2563 // umopa za3.s, p1/M, p1/M, z11.b, z27.b\n" + ".inst 0xa1bc2440 // umopa za0.s, p1/M, p1/M, z2.b, z28.b\n" + ".inst 0xa1bd2441 // umopa za1.s, p1/M, p1/M, z2.b, z29.b\n" + ".inst 0xa1be2442 // umopa za2.s, p1/M, p1/M, z2.b, z30.b\n" + ".inst 0xa1bf2443 // umopa za3.s, p1/M, p1/M, z2.b, z31.b\n" + ".inst 0xa1b025c0 // umopa za0.s, p1/M, p1/M, z14.b, z16.b\n" + ".inst 0xa1b125c1 // umopa za1.s, p1/M, p1/M, z14.b, z17.b\n" + ".inst 0xa1b225c2 // umopa za2.s, p1/M, p1/M, z14.b, z18.b\n" + ".inst 0xa1b325c3 // umopa za3.s, p1/M, p1/M, z14.b, z19.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - "ld1b { z10.b }, p1/Z, [x25]\n" + "ld1b { z16.b }, p1/Z, [x25]\n" "subs x21, x21, #0x1\n" "addvl x25, x25, #1\n" - ".inst 0xa04086fc // ld1b { z28.b-z31.b }, pn9.b/Z, [x23]\n" + ".inst 0xa04086e4 // ld1b { z4.b-z7.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #4\n" - ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n" - ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n" - ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n" - ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n" + ".inst 0xa1a42600 // umopa za0.s, p1/M, p1/M, z16.b, z4.b\n" + ".inst 0xa1a52601 // umopa za1.s, p1/M, p1/M, z16.b, z5.b\n" + ".inst 0xa1a62602 // umopa za2.s, p1/M, p1/M, z16.b, z6.b\n" + ".inst 0xa1a72603 // umopa za3.s, p1/M, p1/M, z16.b, z7.b\n" "bgt 9b\n" "10:" // K oddments: End - "ld1w { z14.s }, p1/Z, [x25]\n" + "ld1w { z15.s }, p1/Z, [x25]\n" "addvl x25, x25, #1\n" - ".inst 
0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n" - ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n" + ".inst 0xc09125e0 // addva za0.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e1 // addva za1.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n" + ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n" "tbz x14, #1, 14f\n" "tbz x14, #0, 12f\n" "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5b8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x13]\n" - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" - ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n" + ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xa041c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" + ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n" + ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n" "addvl x13, x13, #16\n" - ".inst 0xa061c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x4, MUL VL]\n" - ".inst 0xa062c578 // st1w { z24.s-z27.s }, pn9.b, [x11, #0x8, MUL VL]\n" - ".inst 0xa063c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0xc, MUL VL]\n" + ".inst 0xa061c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0x4, MUL VL]\n" + ".inst 0xa062c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x8, MUL VL]\n" + ".inst 0xa063c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0xc, MUL VL]\n" "addvl x11, x11, #16\n" "blt 11b\n" "b 21f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n" - ".inst 0xa060c57c // st1w { z28.s-z31.s }, pn9.b, [x11]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa061c560 // st1w { z0.s-z3.s }, pn9.b, [x11, #0x4, MUL VL]\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" + ".inst 0xa060c564 // st1w { z4.s-z7.s }, pn9.b, [x11]\n" + ".inst 0xc086044c // 
mova { z12.s-z15.s }, za2h.s[x12]\n" + ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n" + ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0x8, MUL VL]\n" - ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n" + ".inst 0xa062c56c // st1w { z12.s-z15.s }, pn9.b, [x11, #0x8, MUL VL]\n" + ".inst 0xa063c568 // st1w { z8.s-z11.s }, pn9.b, [x11, #0xc, MUL VL]\n" "addvl x11, x11, #16\n" "blt 13b\n" "b 21f\n" @@ -277,17 +276,17 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "ldr x24, [%x[args], %[offsetof_C]]\n" "add x24, x24, x28\n" // C += n "sub x23, x10, x9\n" - "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x22, [%x[args], %[offsetof_ldcb]]\n" "madd x24, x9, x22, x24\n" // C += m * ldc - "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x14, #2, 15f\n" @@ -295,10 +294,10 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "add x21, x21, x28\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n" + ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa040c284 // ld1w { z4.s-z7.s }, p8/Z, [x20]\n" + ".inst 0xa040c28c // ld1w { z12.s-z15.s }, p8/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x20\n" "whilelt p0.b, x28, x27\n" @@ -311,22 +310,22 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "16:" // Store to output array: Accumulator row 0 loop ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n" ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n" - ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n" + ".inst 0xc1a4a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n" ".inst 0xc0860096 
// mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n" ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n" - ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n" - ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n" + ".inst 0xc1a5a41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n" + ".inst 0xc1a6a416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n" "add x12, x12, #0x2\n" "cmp x12, x21, LSL #1\n" - ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n" - ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n" - ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n" - ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n" - ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n" - ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n" - ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n" - ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n" - ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n" + ".inst 0xc1a7a410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n" + ".inst 0xc1aca23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n" + ".inst 0xc1ada23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n" + ".inst 0xc1aea236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n" + ".inst 0xc1afa230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n" + ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n" + ".inst 0xc1a0a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z0.s\n" + ".inst 0xc1a0a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z0.s\n" + ".inst 0xc1a0a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z0.s\n" ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n" ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n" "uzp1 z19.b, z26.b, z28.b\n" @@ -344,29 +343,29 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" - ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n" + ".inst 0xc086000a // mova { z10.s-z11.s }, za0h.s[x12, 0:1]\n" ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n" - ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n" - ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n" - ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n" - ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n" - ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n" - ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n" - ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n" - ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n" - ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n" - ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n" - ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n" - ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n" - ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n" - ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n" - ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n" - ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n" - "uzp1 z23.b, z2.b, z24.b\n" - ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n" + ".inst 0xc1a4a40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z4.s\n" + ".inst 0xc086009a // mova { 
z26.s-z27.s }, za2h.s[x12, 0:1]\n" + ".inst 0xc08600de // mova { z30.s-z31.s }, za3h.s[x12, 0:1]\n" + ".inst 0xc1a5a418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n" + ".inst 0xc1a6a41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z6.s\n" + ".inst 0xc1a7a41e // sqdmulh { z30.s-z31.s }, { z30.s-z31.s }, z7.s\n" + ".inst 0xc1aca22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z12.s\n" + ".inst 0xc1ada238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n" + ".inst 0xc1aea23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z14.s\n" + ".inst 0xc1afa23e // srshl { z30.s-z31.s }, { z30.s-z31.s }, z15.s\n" + ".inst 0xc1a0a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z0.s\n" + ".inst 0xc1a0a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z0.s\n" + ".inst 0xc1a0a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z0.s\n" + ".inst 0xc1a0a31e // add { z30.s-z31.s }, { z30.s-z31.s }, z0.s\n" ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n" - "uzp1 z16.b, z16.b, z10.b\n" - "uzp1 z16.b, z23.b, z16.b\n" + ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n" + "uzp1 z17.b, z10.b, z24.b\n" + ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n" + ".inst 0xc1b4c6be // sclamp { z30.s-z31.s }, z21.s, z20.s\n" + "uzp1 z16.b, z26.b, z30.b\n" + "uzp1 z16.b, z17.b, z16.b\n" "st1b { z16.b }, p0, [x24]\n" "18:" // Store to output array: Accumulator row 0 oddments: End "19:" // Store to output array: End @@ -374,14 +373,14 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "20:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n" - ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa041c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" - ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n" + ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa043c5a0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n" + ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x13, x13, #16\n" @@ -405,4 +404,3 @@ void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uin } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp index 123405bd17..f8c375f9f5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp index cb0e9521e3..9a59799529 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa041c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n" - ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa040c5e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n" + ".inst 0xa041c5e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - ".inst 0xa00a4295 // ldnt1w { z20.s-z21.s }, p8/Z, [x20, x10, LSL #2]\n" - ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n" - ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n" - ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n" - ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n" + ".inst 0xa00a4299 // ldnt1w { z24.s-z25.s }, p8/Z, [x20, x10, LSL #2]\n" + ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n" + ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n" + ".inst 0xc0902702 // addha za2.s, p1/M, p1/M, z24.s\n" + ".inst 0xc0902723 // addha za3.s, p1/M, p1/M, z25.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" "mov x21, x11\n" @@ -152,75 +151,75 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 
0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" - ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" - ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n" + ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" + ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "ble 7f\n" "6:" // K loop - ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n" + ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n" - ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n" - ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" - ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n" - ".inst 0xa04006f1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" - ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n" - ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n" - ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n" - ".inst 0xa041076e // ld1b { z14.b-z15.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" - ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n" - ".inst 0xa04106e9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" - ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n" - ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n" - ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n" - ".inst 0xa0420760 // ld1b { z0.b-z1.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa14206fc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" - ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n" - ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n" - ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n" - ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n" - ".inst 0xa0430764 // ld1b { z4.b-z5.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" + ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n" + ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n" + ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa1400763 // ld1b { z3.b, z11.b }, pn9.b/Z, [x27]\n" + ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n" + ".inst 0xa14006f9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x23]\n" + ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n" + ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n" + ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n" + ".inst 0xa1410774 // ld1b { z20.b, z28.b }, pn9.b/Z, [x27, #0x2, MUL VL]\n" + ".inst 0xa1b026a0 // umopa za0.s, 
p1/M, p1/M, z21.b, z16.b\n" + ".inst 0xa04106f7 // ldnt1b { z22.b-z23.b }, pn9.b/Z, [x23, #0x2, MUL VL]\n" + ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n" + ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n" + ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n" + ".inst 0xa1420775 // ld1b { z21.b, z29.b }, pn9.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa14206f8 // ldnt1b { z16.b, z24.b }, pn9.b/Z, [x23, #0x4, MUL VL]\n" + ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n" + ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n" + ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n" + ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n" + ".inst 0xa1430765 // ld1b { z5.b, z13.b }, pn9.b/Z, [x27, #0x6, MUL VL]\n" "addvl x27, x27, #8\n" - ".inst 0xa14306ea // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" + ".inst 0xa14306ef // ldnt1b { z7.b, z15.b }, pn9.b/Z, [x23, #0x6, MUL VL]\n" "addvl x23, x23, #8\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n" - ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n" - ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n" - ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n" - ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n" - ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n" - ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n" - ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n" - ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n" - ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n" - ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n" - ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n" - ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n" - ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n" - ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n" + ".inst 0xa1b12460 // umopa za0.s, p1/M, p1/M, z3.b, z17.b\n" + ".inst 0xa1b92461 // umopa za1.s, p1/M, p1/M, z3.b, z25.b\n" + ".inst 0xa1b12562 // umopa za2.s, p1/M, p1/M, z11.b, z17.b\n" + ".inst 0xa1b92563 // umopa za3.s, p1/M, p1/M, z11.b, z25.b\n" + ".inst 0xa1b62680 // umopa za0.s, p1/M, p1/M, z20.b, z22.b\n" + ".inst 0xa1b72681 // umopa za1.s, p1/M, p1/M, z20.b, z23.b\n" + ".inst 0xa1b62782 // umopa za2.s, p1/M, p1/M, z28.b, z22.b\n" + ".inst 0xa1b72783 // umopa za3.s, p1/M, p1/M, z28.b, z23.b\n" + ".inst 0xa1b026a0 // umopa za0.s, p1/M, p1/M, z21.b, z16.b\n" + ".inst 0xa1b826a1 // umopa za1.s, p1/M, p1/M, z21.b, z24.b\n" + ".inst 0xa1b027a2 // umopa za2.s, p1/M, p1/M, z29.b, z16.b\n" + ".inst 0xa1b827a3 // umopa za3.s, p1/M, p1/M, z29.b, z24.b\n" + ".inst 0xa1a724a0 // umopa za0.s, p1/M, p1/M, z5.b, z7.b\n" + ".inst 0xa1af24a1 // umopa za1.s, p1/M, p1/M, z5.b, z15.b\n" + ".inst 0xa1a725a2 // umopa za2.s, p1/M, p1/M, z13.b, z7.b\n" + ".inst 0xa1af25a3 // umopa za3.s, p1/M, p1/M, z13.b, z15.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop - ".inst 0xa040077e // ld1b { z30.b-z31.b }, pn9.b/Z, [x27]\n" + ".inst 0xa1400773 // ld1b { z19.b, z27.b }, pn9.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #2\n" ".inst 0xa04006f0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x23]\n" "addvl x23, x23, #2\n" - ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n" - ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n" - ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, 
z16.b\n" - ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n" + ".inst 0xa1b02660 // umopa za0.s, p1/M, p1/M, z19.b, z16.b\n" + ".inst 0xa1b12661 // umopa za1.s, p1/M, p1/M, z19.b, z17.b\n" + ".inst 0xa1b02762 // umopa za2.s, p1/M, p1/M, z27.b, z16.b\n" + ".inst 0xa1b12763 // umopa za3.s, p1/M, p1/M, z27.b, z17.b\n" "bgt 9b\n" "10:" // K oddments: End ".inst 0xa040476e // ld1w { z14.s-z15.s }, pn9.b/Z, [x27]\n" @@ -234,25 +233,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15]\n" - ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n" - ".inst 0xa041c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" - ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xa042c5f8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n" - ".inst 0xa043c5ec // ld1w { z12.s-z15.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n" + ".inst 0xa040c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" + ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa041c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" + ".inst 0xa042c5f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa043c5fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14]\n" + ".inst 0xa060c5c0 // st1w { z0.s-z3.s }, pn9.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c5dc // st1w { z28.s-z31.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 24f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" - ".inst 0xa060c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14]\n" - ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" - ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" - ".inst 0xa061c5c4 // st1w { z4.s-z7.s }, pn9.b, [x14, #0x4, MUL VL]\n" + ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xa060c5cc // st1w { 
z12.s-z15.s }, pn9.b, [x14]\n" + ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n" + ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" + ".inst 0xa061c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c5c8 // st1w { z8.s-z11.s }, pn9.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c5cc // st1w { z12.s-z15.s }, pn9.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c5d0 // st1w { z16.s-z19.s }, pn9.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c5d4 // st1w { z20.s-z23.s }, pn9.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 24f\n" @@ -277,13 +276,13 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "ldr x26, [%x[args], %[offsetof_C]]\n" "add x26, x26, x10\n" // C += n "sub x25, x13, x11\n" - "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x24, [%x[args], %[offsetof_ldcb]]\n" "madd x26, x11, x24, x26\n" // C += m * ldc - "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" - "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x16, #2, 15f\n" @@ -291,10 +290,10 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "add x21, x21, x10\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n" + ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - ".inst 0xa0404280 // ld1w { z0.s-z1.s }, p8/Z, [x20]\n" + ".inst 0xa0404282 // ld1w { z2.s-z3.s }, p8/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x23\n" "whilelt p0.h, x10, x9\n" @@ -305,26 +304,26 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "and x20, x22, #0x3\n" "cbz x21, 17f\n" "16:" // Store to output array: Accumulator row 0 loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" - ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n" + ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n" + ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" "add x12, x12, #0x4\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n" - ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" - ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" - ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n" 
- ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n" - "uzp1 z16.h, z12.h, z28.h\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" + ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n" + "uzp1 z16.h, z4.h, z8.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z13.h, z29.h\n" - "uzp1 z17.h, z14.h, z30.h\n" + "uzp1 z16.h, z5.h, z9.h\n" + "uzp1 z17.h, z6.h, z10.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z15.h, z31.h\n" + "uzp1 z16.h, z7.h, z11.h\n" "st1b { z17.h }, p0, [x26]\n" "add x26, x26, x24\n" "st1b { z16.h }, p0, [x26]\n" @@ -332,27 +331,27 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" - ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n" - ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" - ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n" + ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a0ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n" + ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" - ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" - ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n" - ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n" - ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n" - "uzp1 z16.h, z28.h, z12.h\n" + ".inst 0xc1a2aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" + ".inst 0xc1a3aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z3.s\n" + ".inst 0xc1aeab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z14.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1b8cf28 // sclamp { z8.s-z11.s }, z25.s, z24.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" + "uzp1 z16.h, z8.h, z4.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" "subs x20, x20, #0x1\n" - "uzp1 z16.h, z29.h, z13.h\n" + "uzp1 z16.h, z9.h, z5.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" - "uzp1 z16.h, z30.h, z14.h\n" + "uzp1 z16.h, z10.h, z6.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "18:" // Store to output array: Accumulator row 0 oddments: End @@ -367,25 +366,25 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 1 loop ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" - ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" - ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac14 // sqdmulh { z20.s-z23.s }, { 
z20.s-z23.s }, z1.s\n" "add x12, x12, #0x4\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" - ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" - ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n" - ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z14.s\n" ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" - ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n" - "uzp1 z16.h, z4.h, z16.h\n" + ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n" + "uzp1 z16.h, z4.h, z20.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z5.h, z17.h\n" - "uzp1 z17.h, z6.h, z18.h\n" + "uzp1 z16.h, z5.h, z21.h\n" + "uzp1 z17.h, z6.h, z22.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" - "uzp1 z16.h, z7.h, z19.h\n" + "uzp1 z16.h, z7.h, z23.h\n" "st1b { z17.h }, p0, [x26]\n" "add x26, x26, x24\n" "st1b { z16.h }, p0, [x26]\n" @@ -393,27 +392,27 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin "blt 19b\n" "20:" // Store to output array: Accumulator row 1 oddments "cbz x20, 21f\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n" - ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n" - ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc1a0ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1a1ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n" - ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" - ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n" - ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n" - ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n" + ".inst 0xc1a2aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" + ".inst 0xc1a3aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n" + ".inst 0xc1aeab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n" + ".inst 0xc1aeab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z14.s\n" + ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n" ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n" - "uzp1 z16.h, z20.h, z16.h\n" + "uzp1 z16.h, z4.h, z16.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" "subs x20, x20, #0x1\n" - "uzp1 z16.h, z21.h, z17.h\n" + "uzp1 z16.h, z5.h, z17.h\n" "st1b { z16.h }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" - "uzp1 z16.h, z22.h, z18.h\n" + "uzp1 z16.h, z6.h, z18.h\n" "st1b { z16.h }, p0, [x26]\n" "21:" // Store to output array: Accumulator row 1 oddments: End "22:" // Store to output array: End @@ -452,4 +451,3 @@ void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uin } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp index 2e61cf49a8..04d19324c5 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #pragma once -#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SME2 #include <cstdint> #include "../std_transforms_sme.hpp" @@ -83,12 +83,11 @@ public: StdTransformsSME transforms = {}; - cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *ci) + cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *) { - ARM_COMPUTE_UNUSED(ci); } }; } // namespace arm_gemm -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp index 8f8886b876..0f3346e65e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifdef __ARM_FEATURE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 #include "arm_gemm.hpp" @@ -100,14 +99,14 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "1:" // Initial accumulator load from buffer: Loop - ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" - ".inst 0xa041c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n" - ".inst 0xa042c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" - ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n" - ".inst 0xa043c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa040c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n" + ".inst 0xa041c1f8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n" + ".inst 0xa042c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" + ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n" + ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -125,11 +124,11 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "ldr x20, [%x[args], %[offsetof_bias]]\n" ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n" "cbz x20, 5f\n" - "ldnt1w { z15.s }, p0/Z, [x20, x10, LSL #2]\n" - ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n" - ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n" + "ldnt1w { z8.s }, p0/Z, [x20, x10, LSL #2]\n" + ".inst 0xc0902500 // addha za0.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902501 // addha za1.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902502 // addha za2.s, p1/M, p1/M, z8.s\n" + ".inst 0xc0902503 // addha za3.s, p1/M, p1/M, z8.s\n" "4:" // Prepare accumulators: Test for last block "mov x20, x10\n" 
"mov x21, x11\n" @@ -152,107 +151,107 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "madd x23, x10, x20, x23\n" // bptr = B + n * kstride_bytes "cbz x22, 8f\n" "subs x22, x22, #0x1\n" - ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" - "ldnt1b { z0.b }, p1/Z, [x23]\n" - ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n" + "ldnt1b { z14.b }, p1/Z, [x23]\n" + ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "ble 7f\n" "6:" // K loop - ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n" + ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n" "subs x22, x22, #0x1\n" - ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n" - ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" - ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n" - "ldnt1b { z0.b }, p1/Z, [x23]\n" - ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n" - ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n" - ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n" - ".inst 0xa1418373 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" - ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n" - "ldnt1b { z9.b }, p1/Z, [x23, #1, MUL VL]\n" - ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n" - ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n" - ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n" - ".inst 0xa1428370 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" - "ldnt1b { z21.b }, p1/Z, [x23, #2, MUL VL]\n" - ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n" - ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n" - ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n" - ".inst 0xa1438362 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" + ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n" + ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n" + ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n" + ".inst 0xa0408364 // ld1b { z4.b-z7.b }, pn8.b/Z, [x27]\n" + ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n" + "ldnt1b { z14.b }, p1/Z, [x23]\n" + ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n" + ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n" + ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n" + ".inst 0xa0418374 // ld1b { z20.b-z23.b }, pn8.b/Z, [x27, #0x4, MUL VL]\n" + ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, 
z13.b\n" + "ldnt1b { z31.b }, p1/Z, [x23, #1, MUL VL]\n" + ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n" + ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n" + ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n" + ".inst 0xa0428378 // ld1b { z24.b-z27.b }, pn8.b/Z, [x27, #0x8, MUL VL]\n" + "ldnt1b { z13.b }, p1/Z, [x23, #2, MUL VL]\n" + ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n" + ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n" + ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n" + ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n" + ".inst 0xa0438368 // ld1b { z8.b-z11.b }, pn8.b/Z, [x27, #0xc, MUL VL]\n" "addvl x27, x27, #16\n" - "ldnt1b { z12.b }, p1/Z, [x23, #3, MUL VL]\n" + "ldnt1b { z29.b }, p1/Z, [x23, #3, MUL VL]\n" "addvl x23, x23, #4\n" "bgt 6b\n" "7:" // K loop tail - ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n" - ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n" - ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n" - ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n" - ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n" - ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n" - ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n" - ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n" - ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n" - ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n" - ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n" - ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n" - ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n" - ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n" + ".inst 0xa1ae2480 // umopa za0.s, p1/M, p1/M, z4.b, z14.b\n" + ".inst 0xa1ae24a1 // umopa za1.s, p1/M, p1/M, z5.b, z14.b\n" + ".inst 0xa1ae24c2 // umopa za2.s, p1/M, p1/M, z6.b, z14.b\n" + ".inst 0xa1ae24e3 // umopa za3.s, p1/M, p1/M, z7.b, z14.b\n" + ".inst 0xa1bf2680 // umopa za0.s, p1/M, p1/M, z20.b, z31.b\n" + ".inst 0xa1bf26a1 // umopa za1.s, p1/M, p1/M, z21.b, z31.b\n" + ".inst 0xa1bf26c2 // umopa za2.s, p1/M, p1/M, z22.b, z31.b\n" + ".inst 0xa1bf26e3 // umopa za3.s, p1/M, p1/M, z23.b, z31.b\n" + ".inst 0xa1ad2700 // umopa za0.s, p1/M, p1/M, z24.b, z13.b\n" + ".inst 0xa1ad2721 // umopa za1.s, p1/M, p1/M, z25.b, z13.b\n" + ".inst 0xa1ad2742 // umopa za2.s, p1/M, p1/M, z26.b, z13.b\n" + ".inst 0xa1ad2763 // umopa za3.s, p1/M, p1/M, z27.b, z13.b\n" + ".inst 0xa1bd2500 // umopa za0.s, p1/M, p1/M, z8.b, z29.b\n" + ".inst 0xa1bd2521 // umopa za1.s, p1/M, p1/M, z9.b, z29.b\n" + ".inst 0xa1bd2542 // umopa za2.s, p1/M, p1/M, z10.b, z29.b\n" + ".inst 0xa1bd2563 // umopa za3.s, p1/M, p1/M, z11.b, z29.b\n" "8:" // K oddments "cbz x21, 10f\n" "9:" // K oddments: Loop ".inst 0xa1408372 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x27]\n" "subs x21, x21, #0x1\n" "addvl x27, x27, #4\n" - "ld1b { z0.b }, p1/Z, [x23]\n" + "ld1b { z15.b }, p1/Z, [x23]\n" "addvl x23, x23, #1\n" - ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n" - ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n" - ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n" - ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n" + ".inst 0xa1af2640 // umopa za0.s, p1/M, p1/M, z18.b, z15.b\n" + ".inst 0xa1af26c1 // umopa za1.s, p1/M, p1/M, z22.b, 
z15.b\n" + ".inst 0xa1af2742 // umopa za2.s, p1/M, p1/M, z26.b, z15.b\n" + ".inst 0xa1af27c3 // umopa za3.s, p1/M, p1/M, z30.b, z15.b\n" "bgt 9b\n" "10:" // K oddments: End - ".inst 0xa040c360 // ld1w { z0.s-z3.s }, pn8.b/Z, [x27]\n" + ".inst 0xa140c363 // ld1w { z3.s, z7.s, z11.s, z15.s }, pn8.b/Z, [x27]\n" "addvl x27, x27, #4\n" - ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n" - ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n" - ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n" - ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n" + ".inst 0xc0912460 // addva za0.s, p1/M, p1/M, z3.s\n" + ".inst 0xc09124e1 // addva za1.s, p1/M, p1/M, z7.s\n" + ".inst 0xc0912562 // addva za2.s, p1/M, p1/M, z11.s\n" + ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n" "tbz x16, #1, 14f\n" "tbz x16, #0, 12f\n" "mov x12, #0x0\n" "cntw x20\n" "11:" // Store to partial result buffer: Store and refill: Loop - ".inst 0xa040c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n" - ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n" - ".inst 0xa041c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" - ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n" - ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xa040c1ec // ld1w { z12.s-z15.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" + ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n" + ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" + ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" + ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" + ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" + ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n" ".inst 0xa042c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n" - ".inst 0xa043c1f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n" + ".inst 0xa043c1e8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n" + ".inst 0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" "addvl x15, x15, #16\n" - ".inst 0xa061c1c8 // st1w { z8.s-z11.s }, pn8.b, [x14, #0x4, MUL VL]\n" - ".inst 0xa062c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa061c1dc // st1w { z28.s-z31.s }, pn8.b, [x14, #0x4, MUL VL]\n" + ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 11b\n" "b 30f\n" @@ -260,16 +259,16 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "mov x12, #0x0\n" "cntw x20\n" "13:" // Store to partial result buffer: Store only: Loop - ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n" ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n" - ".inst 0xa060c1d0 // st1w { z16.s-z19.s }, pn8.b, [x14]\n" - ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n" - ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n" + ".inst 
0xa060c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14]\n" + ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n" + ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" ".inst 0xa061c1cc // st1w { z12.s-z15.s }, pn8.b, [x14, #0x4, MUL VL]\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" - ".inst 0xa062c1d4 // st1w { z20.s-z23.s }, pn8.b, [x14, #0x8, MUL VL]\n" - ".inst 0xa063c1d8 // st1w { z24.s-z27.s }, pn8.b, [x14, #0xc, MUL VL]\n" + ".inst 0xa062c1c4 // st1w { z4.s-z7.s }, pn8.b, [x14, #0x8, MUL VL]\n" + ".inst 0xa063c1c0 // st1w { z0.s-z3.s }, pn8.b, [x14, #0xc, MUL VL]\n" "addvl x14, x14, #16\n" "blt 13b\n" "b 30f\n" @@ -277,22 +276,22 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "ldr x26, [%x[args], %[offsetof_C]]\n" "add x26, x26, x10\n" // C += n "sub x25, x13, x11\n" - "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" + "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n" "ldr x24, [%x[args], %[offsetof_ldcb]]\n" "madd x26, x11, x24, x26\n" // C += m * ldc - "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" - "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" - "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" - "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" + "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n" + "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n" + "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n" + "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n" "tbz x16, #2, 15f\n" "ldr w21, [%x[args], %[offsetof_n_0]]\n" "add x21, x21, x10\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n" "add x20, x20, x21, LSL #2\n" - "ld1w { z8.s }, p0/Z, [x20]\n" + "ld1w { z2.s }, p0/Z, [x20]\n" "ldr x20, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n" "add x20, x20, x21, LSL #2\n" - "ld1w { z7.s }, p0/Z, [x20]\n" + "ld1w { z1.s }, p0/Z, [x20]\n" "15:" // Store to output array: Load per-channel parameters: End "cntw x23\n" "whilelt p0.s, x10, x9\n" @@ -303,30 +302,30 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "and x20, x22, #0x3\n" "cbz x21, 17f\n" "16:" // Store to output array: Accumulator row 0 loop - ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n" - ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n" + ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n" + ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" + ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n" - "st1b { z12.s }, p0, [x26]\n" + ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n" + ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n" + "st1b { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z13.s }, p0, [x26]\n" + "st1b { z17.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z14.s }, p0, [x26]\n" + "st1b { z18.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z15.s }, p0, [x26]\n" + "st1b { z19.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 16b\n" "17:" // Store to output array: Accumulator row 0 oddments "cbz x20, 18f\n" ".inst 0xc0860410 // mova { 
z16.s-z19.s }, za0h.s[x12]\n" - ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n" + ".inst 0xc1a2ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" - ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" - ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n" + ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n" + ".inst 0xc1a0ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z0.s\n" + ".inst 0xc1b4ceb0 // sclamp { z16.s-z19.s }, z21.s, z20.s\n" "st1b { z16.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 18f\n" @@ -347,38 +346,38 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "and x20, x22, #0x3\n" "cbz x21, 20f\n" "19:" // Store to output array: Accumulator row 1 loop - ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n" - ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n" - ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n" - "st1b { z16.s }, p0, [x26]\n" + ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1b { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z17.s }, p0, [x26]\n" + "st1b { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z18.s }, p0, [x26]\n" + "st1b { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z19.s }, p0, [x26]\n" + "st1b { z7.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 19b\n" "20:" // Store to output array: Accumulator row 1 oddments "cbz x20, 21f\n" - ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n" - ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n" + ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n" + ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n" - ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n" - ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n" - "st1b { z28.s }, p0, [x26]\n" + ".inst 0xc1a1aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n" + ".inst 0xc1a0ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n" + ".inst 0xc1b4cea4 // sclamp { z4.s-z7.s }, z21.s, z20.s\n" + "st1b { z4.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" "subs x20, x20, #0x1\n" - "st1b { z29.s }, p0, [x26]\n" + "st1b { z5.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 21f\n" - "st1b { z30.s }, p0, [x26]\n" + "st1b { z6.s }, p0, [x26]\n" "add x26, x26, x24\n" "21:" // Store to output array: Accumulator row 1 oddments: End "subs x25, x25, x22\n" @@ -391,30 +390,30 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "and x20, x22, #0x3\n" "cbz x21, 23f\n" "22:" // Store to output array: Accumulator row 2 loop - ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n" - ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n" + ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n" + ".inst 0xc1a2ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z2.s\n" "add x12, x12, #0x4\n" - 
".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n" + ".inst 0xc1a1aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n" - ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n" - "st1b { z24.s }, p0, [x26]\n" + ".inst 0xc1a0ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z0.s\n" + ".inst 0xc1b4cea8 // sclamp { z8.s-z11.s }, z21.s, z20.s\n" + "st1b { z8.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z25.s }, p0, [x26]\n" + "st1b { z9.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z26.s }, p0, [x26]\n" + "st1b { z10.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z27.s }, p0, [x26]\n" + "st1b { z11.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 22b\n" "23:" // Store to output array: Accumulator row 2 oddments "cbz x20, 24f\n" ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n" - ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n" + ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n" - ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n" - ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n" + ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n" + ".inst 0xc1a0ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n" + ".inst 0xc1b4ceac // sclamp { z12.s-z15.s }, z21.s, z20.s\n" "st1b { z12.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 24f\n" @@ -435,52 +434,52 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin "and x20, x20, #0x3\n" "cbz x21, 26f\n" "25:" // Store to output array: Accumulator row 3 loop - ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n" - ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" "add x12, x12, #0x4\n" - ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n" + ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" "cmp x12, x21, LSL #2\n" - ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n" - ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n" - "st1b { z20.s }, p0, [x26]\n" + ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" + ".inst 0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1b { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z21.s }, p0, [x26]\n" + "st1b { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z22.s }, p0, [x26]\n" + "st1b { z30.s }, p0, [x26]\n" "add x26, x26, x24\n" - "st1b { z23.s }, p0, [x26]\n" + "st1b { z31.s }, p0, [x26]\n" "add x26, x26, x24\n" "blt 25b\n" "26:" // Store to output array: Accumulator row 3 oddments "cbz x20, 27f\n" - ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n" - ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n" + ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n" + ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n" "subs x20, x20, #0x1\n" - ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n" - ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n" - ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n" - "st1b { z0.s }, p0, [x26]\n" + ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n" + ".inst 0xc1a0ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n" + ".inst 
0xc1b4cebc // sclamp { z28.s-z31.s }, z21.s, z20.s\n" + "st1b { z28.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 27f\n" "subs x20, x20, #0x1\n" - "st1b { z1.s }, p0, [x26]\n" + "st1b { z29.s }, p0, [x26]\n" "add x26, x26, x24\n" "beq 27f\n" - "st1b { z2.s }, p0, [x26]\n" + "st1b { z30.s }, p0, [x26]\n" "27:" // Store to output array: Accumulator row 3 oddments: End "28:" // Store to output array: End "tbz x16, #0, 30f\n" "mov x12, #0x0\n" "cntw x20\n" "29:" // Store to output array: Refill accumulators: Loop - ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n" - ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa040c1fc // ld1w { z28.s-z31.s }, pn8.b/Z, [x15]\n" + ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n" ".inst 0xa041c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x4, MUL VL]\n" ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n" ".inst 0xa042c1f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x15, #0x8, MUL VL]\n" ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n" - ".inst 0xa043c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" - ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n" + ".inst 0xa043c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15, #0xc, MUL VL]\n" + ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n" "add x12, x12, #0x4\n" "cmp x12, x20\n" "addvl x15, x15, #16\n" @@ -504,4 +503,3 @@ void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uin } // namespace arm_gemm #endif // ARM_COMPUTE_ENABLE_SME2 -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp index e07fa549f3..1ce169d562 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp index 13f2e488dd..9136e32567 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp @@ -157,16 +157,16 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 6f\n" "4:" // Height 1: no bias "tbz %x[flags], #0, 5f\n" - "ld1w { z9.s }, p4/Z, [x13]\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x13]\n" + "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n" + "zip1 z8.d, z16.d, z12.d\n" + "zip2 z12.d, z16.d, z12.d\n" + "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 6f\n" @@ -184,11 +184,11 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -200,43 +200,43 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "ble 11f\n" "10:" // Height 1: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" + 
"ld1rqh { z20.h }, p0/Z, [x26]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n" + ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n" + ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n" + ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n" + ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n" "add x26, x26, #0x10\n" "addvl x12, x12, #4\n" "addvl x11, x11, #4\n" @@ -246,46 +246,46 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "11:" // Height 1: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, 
z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" "ble 12f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n" + ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n" + ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n" + ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n" + ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" "addvl x10, x10, #2\n" @@ -301,17 +301,17 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "uzp1 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 13f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z21.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z21.s\n" + "fmin z9.s, p5/M, z9.s, z21.s\n" + "fmin z10.s, p5/M, z10.s, z21.s\n" + "fmin z11.s, p5/M, z11.s, z21.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "13:" // Height 1: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -376,21 +376,21 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "18:" // Height 2: no bias "tbz %x[flags], #0, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x13]\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" + "add x20, x13, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x13]\n" + "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 
z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 20f\n" @@ -408,12 +408,12 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "21:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 22f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -421,50 +421,50 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 23f\n" "22:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "23:" // Height 2: input setup done "cmp x27, #0x8\n" "ble 25f\n" "24:" // Height 2: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" + "ld1rqh { z20.h }, p0/Z, [x26]\n" + "ld1rqh { z19.h }, p0/Z, [x25]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 
0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n" + ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n" + ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n" + ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n" + ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "addvl x12, x12, #4\n" @@ -475,47 +475,47 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "25:" // Height 2: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + "ld1rqh { z19.h }, p0/Z, [x25]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" "subs x27, x27, #0x4\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" "ble 26f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, 
z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n" + ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x11]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n" + ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n" + ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n" + "ld1h { z22.h }, p5/Z, [x9]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n" + ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" "addvl x10, x10, #2\n" @@ -537,25 +537,25 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "uzp2 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, p5/M, z14.s, z17.s\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z7.s, p5/M, z7.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "27:" // Height 2: No activation "st1w { z7.s }, p4, [x13]\n" "st1w { z12.s }, p3, [x13, #1, MUL VL]\n" @@ -632,28 +632,28 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "32:" // Height 3: no bias "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x13]\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" + "add x21, x13, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x13]\n" + "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { 
z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x20]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" @@ -685,13 +685,13 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "35:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -700,145 +700,145 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 37f\n" "36:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "37:" // Height 3: input setup done "cmp x27, #0x8\n" "ble 39f\n" "38:" // Height 3: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z30.h }, p0/Z, [x26]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "ld1rqh { z28.h }, p0/Z, [x24]\n" + "trn1 z27.d, z30.d, z24.d\n" + "trn2 z30.d, z30.d, z24.d\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "trn1 z26.d, z28.d, z29.d\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, 
z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "trn2 z28.d, z28.d, z29.d\n" + ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "cmp x27, #0x8\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" + ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" + ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n" + ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n" "addvl x12, x12, #4\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n" + ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n" + ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n" + ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n" + ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n" + ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n" "addvl 
x9, x9, #4\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n" + ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n" + ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n" + ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n" "bgt 38b\n" "39:" // Height 3: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + "trn1 z27.d, z1.d, z24.d\n" + "trn2 z1.d, z1.d, z24.d\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "trn1 z26.d, z3.d, z28.d\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "trn2 z3.d, z3.d, z28.d\n" + ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "ble 40f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 
0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n" + ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n" + ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n" + ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n" + ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n" + ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n" + ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "addvl x10, x10, #2\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n" + ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x9, x9, #2\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n" + ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n" + ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n" + ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n" "40:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -861,33 +861,33 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "uzp1 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 41f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin 
z14.s, p5/M, z14.s, z25.s\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmax z7.s, p5/M, z7.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, z11.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" "41:" // Height 3: No activation "st1w { z7.s }, p4, [x13]\n" "st1w { z12.s }, p3, [x13, #1, MUL VL]\n" @@ -968,37 +968,37 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "46:" // Height 4: no bias "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" + "add x22, x13, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x13]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x21]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" @@ -1026,14 +1026,14 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "49:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - 
"add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1043,149 +1043,149 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 51f\n" "50:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "51:" // Height 4: input setup done "cmp x27, #0x8\n" "ble 53f\n" "52:" // Height 4: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqh { z30.h }, p0/Z, [x26]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "trn1 z29.d, z30.d, z24.d\n" + "ld1rqh { z28.h }, p0/Z, [x24]\n" + "ld1rqh { z27.h }, p0/Z, [x23]\n" + "trn2 z30.d, z30.d, z24.d\n" + "trn1 z26.d, z28.d, z27.d\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "trn2 z28.d, z28.d, z27.d\n" + ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "cmp x27, #0x8\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" + ".inst 0x6479e7ab // bfmmla z11.s, 
z29.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x12, #3, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" + ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n" + ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x11, #2, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n" + ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n" "addvl x12, x12, #4\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n" + ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "addvl x11, x11, #4\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n" + ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n" + ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n" + ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n" + ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n" + ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n" + ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n" "bgt 52b\n" "53:" // Height 4: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "trn1 z28.d, z1.d, z24.d\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, 
z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1rqh { z27.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z24.d\n" + "trn1 z26.d, z3.d, z27.d\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "trn2 z3.d, z3.d, z27.d\n" + ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" + ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "ble 54f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n" + ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n" + ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n" + ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x11]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n" + ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n" + ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, 
z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n" + ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x9]\n" "addvl x10, x10, #2\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n" + ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x9, x9, #2\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n" + ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n" + ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n" + ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n" "54:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1213,41 +1213,41 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "uzp2 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 55f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z24.s\n" + "fmin z12.s, p5/M, z12.s, z24.s\n" + "fmin z13.s, p5/M, z13.s, z24.s\n" + "fmin z14.s, p5/M, z14.s, z24.s\n" + "fmin z8.s, p5/M, z8.s, z24.s\n" + "fmin z9.s, p5/M, z9.s, z24.s\n" + "fmin z10.s, p5/M, z10.s, z24.s\n" + "fmin z11.s, p5/M, z11.s, z24.s\n" + "fmin z15.s, p5/M, z15.s, z24.s\n" + "fmin z20.s, p5/M, z20.s, z24.s\n" + "fmin z21.s, p5/M, z21.s, z24.s\n" + "fmin z22.s, p5/M, z22.s, z24.s\n" + "fmin z16.s, p5/M, z16.s, z24.s\n" + "fmin z17.s, p5/M, z17.s, z24.s\n" + "fmin z18.s, p5/M, z18.s, z24.s\n" + "fmin z19.s, p5/M, z19.s, z24.s\n" + "fmax z7.s, p5/M, z7.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, 
z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" "55:" // Height 4: No activation "st1w { z7.s }, p4, [x13]\n" "st1w { z12.s }, p3, [x13, #1, MUL VL]\n" @@ -1340,54 +1340,54 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "60:" // Height 5: no bias "tbz %x[flags], #0, 61f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x13, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x13]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x20]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z19.d, z24.d, z23.d\n" "zip2 z23.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z24.d, z25.d, z28.d\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 62f\n" "61:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1419,15 +1419,15 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "63:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, 
LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 64f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 65f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1438,189 +1438,189 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 65f\n" "64:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "65:" // Height 5: input setup done "cmp x27, #0x8\n" "ble 67f\n" "66:" // Height 5: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1rqh { z5.h }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "ld1rqh { z6.h }, p0/Z, [x26]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z7.h }, p0/Z, [x24]\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn1 z5.d, z6.d, z1.d\n" + "trn2 z6.d, z6.d, z1.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "trn1 z3.d, z7.d, z2.d\n" + "trn2 z7.d, z7.d, z2.d\n" + "ld1h { z1.h }, p5/Z, [x12]\n" + "trn1 z2.d, z4.d, z0.d\n" + "trn2 z4.d, z4.d, z0.d\n" + "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n" + ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x11]\n" "sub x27, x27, #0x8\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" "add x25, x25, #0x10\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" "add x23, x23, #0x10\n" "add 
x22, x22, #0x10\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n" + ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x9]\n" + ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" "addvl x12, x12, #4\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n" + ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" "addvl x11, x11, #4\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" "addvl x10, x10, #4\n" - ".inst 0x6467e472 // bfmmla z18.s, 
z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n" + ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" "addvl x9, x9, #4\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n" + ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "bgt 66b\n" "67:" // Height 5: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" + "ld1rqh { z4.h }, p0/Z, [x25]\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn1 z7.d, z1.d, z4.d\n" + "trn2 z1.d, z1.d, z4.d\n" "ld1rqh { z5.h }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "trn1 z6.d, z3.d, z2.d\n" + "trn2 z3.d, z3.d, z2.d\n" + "ld1h { z2.h }, p5/Z, [x12]\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n" + ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n" + ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x11]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" "addvl x12, x12, #2\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n" "addvl x11, x11, #2\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n" + ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, 
[x10]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n" "addvl x10, x10, #2\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x9]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n" "addvl x9, x9, #2\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n" + ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "ble 68f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "ld1h { z2.h }, p5/Z, [x12]\n" + "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n" + ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x11]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" + ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n" + ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n" + ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n" + ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" "addvl x10, x10, #2\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - 
"ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n" + ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x9]\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x9, x9, #2\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n" + ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n" + ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n" + ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "68:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1653,49 +1653,49 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "uzp1 z27.d, z27.d, z31.d\n" "tbz %x[flags], #1, 69f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" - "add x20, %x[args_ptr], %[offset_min]\n" "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - "fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + 
"fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" + "fmax z24.s, p5/M, z24.s, z23.s\n" + "fmax z25.s, p5/M, z25.s, z23.s\n" + "fmax z26.s, p5/M, z26.s, z23.s\n" + "fmax z27.s, p5/M, z27.s, z23.s\n" "69:" // Height 5: No activation "st1w { z7.s }, p4, [x13]\n" "st1w { z12.s }, p3, [x13, #1, MUL VL]\n" @@ -1795,59 +1795,59 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "74:" // Height 6: no bias "tbz %x[flags], #0, 75f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x13]\n" + "add x24, x13, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z17.s }, p4/Z, [x13]\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x13, #2, MUL VL]\n" "add x21, x22, x20, LSL #2\n" + "ld1w { z18.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x13, #2, MUL VL]\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z16.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip2 z12.d, z9.d, z12.d\n" - "zip1 z9.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z17.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "zip2 z12.d, z17.d, z12.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z20.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z20.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" "zip2 z15.d, z16.d, z15.d\n" "zip1 z16.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x21]\n" "zip2 z21.d, z18.d, z21.d\n" "zip1 z18.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, 
p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" "zip2 z23.d, z24.d, z23.d\n" "zip1 z24.d, z25.d, z28.d\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 76f\n" "75:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1879,16 +1879,16 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "77:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 78f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 79f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1900,193 +1900,193 @@ void sve_ffhybrid_bf16fp32_mmla_6x4VL ( "b 79f\n" "78:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "79:" // Height 6: input setup done "cmp x27, #0x8\n" "ble 81f\n" "80:" // Height 6: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1rqh { z5.h }, p0/Z, [x22]\n" - "ld1rqh { z6.h }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "ld1rqh { z7.h }, p0/Z, [x26]\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" + "trn1 z6.d, z7.d, z0.d\n" + "ld1rqh { z5.h }, p0/Z, [x24]\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" + "trn2 z7.d, z7.d, z0.d\n" + "trn1 z4.d, z5.d, z1.d\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "ld1rqh { z0.h }, p0/Z, [x21]\n" + "trn2 z5.d, z5.d, z1.d\n" + "trn1 z2.d, z3.d, z0.d\n" + "trn2 z3.d, z3.d, z0.d\n" + "ld1h { z1.h }, p5/Z, [x12]\n" + "ld1h { z0.h }, p5/Z, [x12, 
#1, MUL VL]\n" + ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" + ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x11]\n" "sub x27, x27, #0x8\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" "add x25, x25, #0x10\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" + ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" "add x21, x21, #0x10\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x12, #2, MUL VL]\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" + ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x9]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" + ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x12, #3, MUL VL]\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" "addvl x12, x12, #4\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, 
[x11, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" + ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n" + ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x11, #2, MUL VL]\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n" + ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" "addvl x11, x11, #4\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" + ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n" + ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n" + ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" "addvl x10, x10, #4\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #3, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" + ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n" + ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x9, #2, MUL VL]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n" + ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" "addvl x9, x9, #4\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n" + ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n" "bgt 80b\n" "81:" // Height 6: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" + "trn1 z7.d, z1.d, z0.d\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z0.d\n" + "trn1 z6.d, z3.d, z2.d\n" "ld1rqh { z5.h }, p0/Z, [x22]\n" - "ld1rqh { z6.h }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "ld1rqh { z0.h }, p0/Z, [x21]\n" + "trn2 
z3.d, z3.d, z2.d\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1h { z2.h }, p5/Z, [x12]\n" + "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n" + ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n" + ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x11]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" "addvl x12, x12, #2\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n" "addvl x11, x11, #2\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n" + ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n" "addvl x10, x10, #2\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x9]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n" "addvl x9, x9, #2\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n" + ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "ble 82f\n" - "ld1h { z7.h }, p5/Z, [x12]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x11]\n" + "ld1h { z2.h }, p5/Z, [x12]\n" + "ld1h { z0.h }, p5/Z, [x12, #1, MUL VL]\n" + ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n" + ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + 
".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x11]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x11, #1, MUL VL]\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" + ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n" + ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n" + ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n" + ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" "addvl x10, x10, #2\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x9, #1, MUL VL]\n" + ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n" + ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x9]\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" "addvl x9, x9, #2\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n" + ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n" + ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n" + ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "82:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp index acbc619eed..c42ad7e879 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
+ * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp index 5f093bf08a..66601bd312 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp @@ -163,11 +163,11 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -183,12 +183,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "10:" // Height 1: Multiply loop: Main loop "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z17.h }, p4/Z, [x10]\n" + "ld1h { z16.h }, p4/Z, [x9]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" "add x26, x26, #0x2\n" "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -201,12 +201,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z17.h }, p4/Z, [x10]\n" + "ld1h { z16.h }, p4/Z, [x9]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" @@ -214,17 +214,17 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "bne 7b\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z17.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" + 
"ld1rh { z16.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z17.h\n" + "fmin z9.h, p4/M, z9.h, z17.h\n" + "fmin z10.h, p4/M, z10.h, z17.h\n" + "fmin z11.h, p4/M, z11.h, z17.h\n" + "fmax z8.h, p4/M, z8.h, z16.h\n" + "fmax z9.h, p4/M, z9.h, z16.h\n" + "fmax z10.h, p4/M, z10.h, z16.h\n" + "fmax z11.h, p4/M, z11.h, z16.h\n" "12:" // Height 1: No activation "st1h { z8.h }, p3, [x13]\n" "st1h { z9.h }, p2, [x13, #1, MUL VL]\n" @@ -285,15 +285,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "17:" // Height 2: no bias "tbz %x[flags], #0, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" + "add x20, x13, x20, LSL #1\n" "ld1h { z8.h }, p3/Z, [x13]\n" "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x20]\n" + "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n" "b 19f\n" "18:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -309,12 +309,12 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "20:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 21f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 22f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -322,7 +322,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "b 22f\n" "21:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "22:" // Height 2: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -333,19 +333,19 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "23:" // Height 2: Multiply loop: Main loop "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z12.h, p4/M, z6.h, z1.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z17.h }, p4/Z, [x10]\n" "addvl x12, x12, #1\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z16.h }, p4/Z, [x9]\n" "addvl x11, x11, #1\n" "add x26, x26, #0x2\n" "subs x27, x27, #0x1\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z14.h, p4/M, z17.h, z1.h\n" "add x25, x25, #0x2\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" + "fmla z15.h, p4/M, z16.h, z1.h\n" "addvl x10, x10, #1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1rh { z1.h }, p4/Z, [x25]\n" @@ -357,18 +357,18 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z12.h, p4/M, z6.h, z1.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z17.h }, p4/Z, [x10]\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z16.h }, p4/Z, [x9]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, 
z1.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z14.h, p4/M, z17.h, z1.h\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" + "fmla z15.h, p4/M, z16.h, z1.h\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "bne 20b\n" @@ -376,25 +376,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x25, x13, x20, LSL #1\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z17.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" + "ld1rh { z16.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z17.h\n" + "fmin z9.h, p4/M, z9.h, z17.h\n" + "fmin z10.h, p4/M, z10.h, z17.h\n" + "fmin z11.h, p4/M, z11.h, z17.h\n" + "fmin z12.h, p4/M, z12.h, z17.h\n" + "fmin z13.h, p4/M, z13.h, z17.h\n" + "fmin z14.h, p4/M, z14.h, z17.h\n" + "fmin z15.h, p4/M, z15.h, z17.h\n" + "fmax z8.h, p4/M, z8.h, z16.h\n" + "fmax z9.h, p4/M, z9.h, z16.h\n" + "fmax z10.h, p4/M, z10.h, z16.h\n" + "fmax z11.h, p4/M, z11.h, z16.h\n" + "fmax z12.h, p4/M, z12.h, z16.h\n" + "fmax z13.h, p4/M, z13.h, z16.h\n" + "fmax z14.h, p4/M, z14.h, z16.h\n" + "fmax z15.h, p4/M, z15.h, z16.h\n" "25:" // Height 2: No activation "st1h { z8.h }, p3, [x13]\n" "st1h { z9.h }, p2, [x13, #1, MUL VL]\n" @@ -463,20 +463,20 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "30:" // Height 3: no bias "tbz %x[flags], #0, 31f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x21, x13, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z8.h }, p3/Z, [x13]\n" "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x21]\n" + "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x20]\n" + "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n" "b 32f\n" "31:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -496,13 +496,13 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "33:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 34f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, 
#0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -511,8 +511,8 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "b 35f\n" "34:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "35:" // Height 3: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -528,22 +528,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z21.h }, p4/Z, [x10]\n" "add x26, x26, #0x2\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z20.h }, p4/Z, [x9]\n" "subs x27, x27, #0x1\n" "add x25, x25, #0x2\n" "add x24, x24, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z21.h, z0.h\n" + "fmla z14.h, p4/M, z21.h, z1.h\n" + "fmla z18.h, p4/M, z21.h, z2.h\n" + "fmla z11.h, p4/M, z20.h, z0.h\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z15.h, p4/M, z20.h, z1.h\n" + "fmla z19.h, p4/M, z20.h, z2.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1rh { z1.h }, p4/Z, [x25]\n" "ld1rh { z2.h }, p4/Z, [x24]\n" @@ -557,54 +557,54 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z21.h }, p4/Z, [x10]\n" "cmp x28, x20\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z20.h }, p4/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z21.h, z0.h\n" + "fmla z14.h, p4/M, z21.h, z1.h\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z18.h, p4/M, z21.h, z2.h\n" + "fmla z11.h, p4/M, z20.h, z0.h\n" "addvl x9, x9, #1\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z15.h, p4/M, z20.h, z1.h\n" + "fmla z19.h, p4/M, z20.h, z2.h\n" "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #1\n" "add x24, x25, x20, LSL #1\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z21.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, 
z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" + "ld1rh { z20.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z21.h\n" + "fmin z9.h, p4/M, z9.h, z21.h\n" + "fmin z10.h, p4/M, z10.h, z21.h\n" + "fmin z11.h, p4/M, z11.h, z21.h\n" + "fmin z12.h, p4/M, z12.h, z21.h\n" + "fmin z13.h, p4/M, z13.h, z21.h\n" + "fmin z14.h, p4/M, z14.h, z21.h\n" + "fmin z15.h, p4/M, z15.h, z21.h\n" + "fmin z16.h, p4/M, z16.h, z21.h\n" + "fmin z17.h, p4/M, z17.h, z21.h\n" + "fmin z18.h, p4/M, z18.h, z21.h\n" + "fmin z19.h, p4/M, z19.h, z21.h\n" + "fmax z8.h, p4/M, z8.h, z20.h\n" + "fmax z9.h, p4/M, z9.h, z20.h\n" + "fmax z10.h, p4/M, z10.h, z20.h\n" + "fmax z11.h, p4/M, z11.h, z20.h\n" + "fmax z12.h, p4/M, z12.h, z20.h\n" + "fmax z13.h, p4/M, z13.h, z20.h\n" + "fmax z14.h, p4/M, z14.h, z20.h\n" + "fmax z15.h, p4/M, z15.h, z20.h\n" + "fmax z16.h, p4/M, z16.h, z20.h\n" + "fmax z17.h, p4/M, z17.h, z20.h\n" + "fmax z18.h, p4/M, z18.h, z20.h\n" + "fmax z19.h, p4/M, z19.h, z20.h\n" "38:" // Height 3: No activation "st1h { z8.h }, p3, [x13]\n" "st1h { z9.h }, p2, [x13, #1, MUL VL]\n" @@ -681,25 +681,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "43:" // Height 4: no bias "tbz %x[flags], #0, 44f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x22, x13, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z8.h }, p3/Z, [x13]\n" - "add x23, x24, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x22]\n" + "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x21]\n" + "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x20]\n" + "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n" "b 45f\n" "44:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -723,14 +723,14 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, 
#0x18]\n" "cbnz x28, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -740,9 +740,9 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "b 48f\n" "47:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "48:" // Height 4: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -759,7 +759,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z20.h, p4/M, z6.h, z3.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z25.h }, p4/Z, [x10]\n" "add x26, x26, #0x2\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" @@ -767,22 +767,22 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x25, x25, #0x2\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "fmla z21.h, p4/M, z7.h, z3.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z24.h }, p4/Z, [x9]\n" "add x24, x24, #0x2\n" "add x23, x23, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z25.h, z0.h\n" + "fmla z14.h, p4/M, z25.h, z1.h\n" "addvl x10, x10, #1\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z18.h, p4/M, z25.h, z2.h\n" + "fmla z22.h, p4/M, z25.h, z3.h\n" "addvl x9, x9, #1\n" "ld1h { z6.h }, p4/Z, [x12]\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z11.h, p4/M, z24.h, z0.h\n" + "fmla z15.h, p4/M, z24.h, z1.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1rh { z1.h }, p4/Z, [x25]\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z19.h, p4/M, z24.h, z2.h\n" + "fmla z23.h, p4/M, z24.h, z3.h\n" "ld1rh { z2.h }, p4/Z, [x24]\n" "ld1rh { z3.h }, p4/Z, [x23]\n" "ld1h { z7.h }, p4/Z, [x11]\n" @@ -794,7 +794,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z20.h, p4/M, z6.h, z3.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z25.h }, p4/Z, [x10]\n" "cmp x28, x20\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" @@ -802,17 +802,17 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "fmla z21.h, p4/M, z7.h, z3.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z24.h }, p4/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z25.h, z0.h\n" + "fmla z14.h, p4/M, z25.h, z1.h\n" "addvl x9, x9, #1\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z18.h, p4/M, z25.h, z2.h\n" + "fmla z22.h, p4/M, z25.h, z3.h\n" + "fmla z11.h, p4/M, z24.h, z0.h\n" + "fmla z15.h, p4/M, z24.h, z1.h\n" + "fmla z19.h, p4/M, z24.h, z2.h\n" + "fmla z23.h, p4/M, z24.h, z3.h\n" "bne 46b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #1\n" @@ -820,41 +820,41 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x23, x24, x20, LSL #1\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z25.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin 
z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmin z20.h, p4/M, z20.h, z1.h\n" - "fmin z21.h, p4/M, z21.h, z1.h\n" - "fmin z22.h, p4/M, z22.h, z1.h\n" - "fmin z23.h, p4/M, z23.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" - "fmax z20.h, p4/M, z20.h, z0.h\n" - "fmax z21.h, p4/M, z21.h, z0.h\n" - "fmax z22.h, p4/M, z22.h, z0.h\n" - "fmax z23.h, p4/M, z23.h, z0.h\n" + "ld1rh { z24.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z25.h\n" + "fmin z9.h, p4/M, z9.h, z25.h\n" + "fmin z10.h, p4/M, z10.h, z25.h\n" + "fmin z11.h, p4/M, z11.h, z25.h\n" + "fmin z12.h, p4/M, z12.h, z25.h\n" + "fmin z13.h, p4/M, z13.h, z25.h\n" + "fmin z14.h, p4/M, z14.h, z25.h\n" + "fmin z15.h, p4/M, z15.h, z25.h\n" + "fmin z16.h, p4/M, z16.h, z25.h\n" + "fmin z17.h, p4/M, z17.h, z25.h\n" + "fmin z18.h, p4/M, z18.h, z25.h\n" + "fmin z19.h, p4/M, z19.h, z25.h\n" + "fmin z20.h, p4/M, z20.h, z25.h\n" + "fmin z21.h, p4/M, z21.h, z25.h\n" + "fmin z22.h, p4/M, z22.h, z25.h\n" + "fmin z23.h, p4/M, z23.h, z25.h\n" + "fmax z8.h, p4/M, z8.h, z24.h\n" + "fmax z9.h, p4/M, z9.h, z24.h\n" + "fmax z10.h, p4/M, z10.h, z24.h\n" + "fmax z11.h, p4/M, z11.h, z24.h\n" + "fmax z12.h, p4/M, z12.h, z24.h\n" + "fmax z13.h, p4/M, z13.h, z24.h\n" + "fmax z14.h, p4/M, z14.h, z24.h\n" + "fmax z15.h, p4/M, z15.h, z24.h\n" + "fmax z16.h, p4/M, z16.h, z24.h\n" + "fmax z17.h, p4/M, z17.h, z24.h\n" + "fmax z18.h, p4/M, z18.h, z24.h\n" + "fmax z19.h, p4/M, z19.h, z24.h\n" + "fmax z20.h, p4/M, z20.h, z24.h\n" + "fmax z21.h, p4/M, z21.h, z24.h\n" + "fmax z22.h, p4/M, z22.h, z24.h\n" + "fmax z23.h, p4/M, z23.h, z24.h\n" "51:" // Height 4: No activation "st1h { z8.h }, p3, [x13]\n" "st1h { z9.h }, p2, [x13, #1, MUL VL]\n" @@ -939,30 +939,30 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "56:" // Height 5: no bias "tbz %x[flags], #0, 57f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p3/Z, [x13]\n" - "add x23, x24, x20, LSL #1\n" + "add x23, x13, x20, LSL #1\n" "add x22, x23, x20, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x13]\n" + "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p3/Z, [x22]\n" - "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, 
p1/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x23]\n" + "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x22]\n" + "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x21]\n" + "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x20]\n" + "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n" "b 58f\n" "57:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -990,15 +990,15 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "59:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 60f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 61f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1009,10 +1009,10 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "b 61f\n" "60:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "61:" // Height 5: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -1034,7 +1034,7 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z24.h, p4/M, z6.h, z4.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z29.h }, p4/Z, [x10]\n" "add x25, x25, #0x2\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" @@ -1042,24 +1042,24 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x23, x23, #0x2\n" "fmla z21.h, p4/M, z7.h, z3.h\n" "fmla z25.h, p4/M, z7.h, z4.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z28.h }, p4/Z, [x9]\n" "add x22, x22, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z29.h, z0.h\n" + "fmla z14.h, p4/M, z29.h, z1.h\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z26.h, p4/M, z6.h, z4.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z18.h, p4/M, z29.h, z2.h\n" + "fmla z22.h, p4/M, z29.h, z3.h\n" + "fmla z26.h, p4/M, z29.h, z4.h\n" + "fmla z11.h, p4/M, z28.h, z0.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1h { z6.h }, p4/Z, [x12]\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z15.h, p4/M, z28.h, z1.h\n" + "fmla z19.h, p4/M, z28.h, z2.h\n" "ld1rh { z1.h }, p4/Z, [x25]\n" "ld1rh { z2.h }, p4/Z, [x24]\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" - "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z23.h, p4/M, 
z28.h, z3.h\n" + "fmla z27.h, p4/M, z28.h, z4.h\n" "ld1rh { z3.h }, p4/Z, [x23]\n" "ld1rh { z4.h }, p4/Z, [x22]\n" "ld1h { z7.h }, p4/Z, [x11]\n" @@ -1075,25 +1075,25 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "addvl x12, x12, #1\n" "fmla z24.h, p4/M, z6.h, z4.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10]\n" + "ld1h { z29.h }, p4/Z, [x10]\n" "addvl x11, x11, #1\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "addvl x10, x10, #1\n" "fmla z21.h, p4/M, z7.h, z3.h\n" "fmla z25.h, p4/M, z7.h, z4.h\n" - "ld1h { z7.h }, p4/Z, [x9]\n" + "ld1h { z28.h }, p4/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z26.h, p4/M, z6.h, z4.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" - "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z10.h, p4/M, z29.h, z0.h\n" + "fmla z14.h, p4/M, z29.h, z1.h\n" + "fmla z18.h, p4/M, z29.h, z2.h\n" + "fmla z22.h, p4/M, z29.h, z3.h\n" + "fmla z26.h, p4/M, z29.h, z4.h\n" + "fmla z11.h, p4/M, z28.h, z0.h\n" + "fmla z15.h, p4/M, z28.h, z1.h\n" + "fmla z19.h, p4/M, z28.h, z2.h\n" + "fmla z23.h, p4/M, z28.h, z3.h\n" + "fmla z27.h, p4/M, z28.h, z4.h\n" "bne 59b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #1\n" @@ -1102,49 +1102,49 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "add x22, x23, x20, LSL #1\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z29.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmin z20.h, p4/M, z20.h, z1.h\n" - "fmin z21.h, p4/M, z21.h, z1.h\n" - "fmin z22.h, p4/M, z22.h, z1.h\n" - "fmin z23.h, p4/M, z23.h, z1.h\n" - "fmin z24.h, p4/M, z24.h, z1.h\n" - "fmin z25.h, p4/M, z25.h, z1.h\n" - "fmin z26.h, p4/M, z26.h, z1.h\n" - "fmin z27.h, p4/M, z27.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" - "fmax z20.h, p4/M, z20.h, z0.h\n" - "fmax z21.h, p4/M, z21.h, z0.h\n" - "fmax z22.h, p4/M, z22.h, z0.h\n" - "fmax z23.h, p4/M, z23.h, z0.h\n" - "fmax z24.h, p4/M, z24.h, z0.h\n" - "fmax z25.h, p4/M, z25.h, z0.h\n" - "fmax z26.h, p4/M, z26.h, z0.h\n" - "fmax z27.h, p4/M, z27.h, z0.h\n" + "ld1rh { z28.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z29.h\n" + "fmin z9.h, p4/M, z9.h, z29.h\n" + "fmin z10.h, p4/M, z10.h, z29.h\n" + "fmin z11.h, p4/M, z11.h, z29.h\n" + "fmin z12.h, p4/M, z12.h, z29.h\n" + "fmin z13.h, p4/M, z13.h, z29.h\n" + "fmin z14.h, p4/M, z14.h, z29.h\n" + "fmin z15.h, p4/M, z15.h, z29.h\n" + "fmin z16.h, p4/M, z16.h, z29.h\n" + "fmin z17.h, 
p4/M, z17.h, z29.h\n" + "fmin z18.h, p4/M, z18.h, z29.h\n" + "fmin z19.h, p4/M, z19.h, z29.h\n" + "fmin z20.h, p4/M, z20.h, z29.h\n" + "fmin z21.h, p4/M, z21.h, z29.h\n" + "fmin z22.h, p4/M, z22.h, z29.h\n" + "fmin z23.h, p4/M, z23.h, z29.h\n" + "fmin z24.h, p4/M, z24.h, z29.h\n" + "fmin z25.h, p4/M, z25.h, z29.h\n" + "fmin z26.h, p4/M, z26.h, z29.h\n" + "fmin z27.h, p4/M, z27.h, z29.h\n" + "fmax z8.h, p4/M, z8.h, z28.h\n" + "fmax z9.h, p4/M, z9.h, z28.h\n" + "fmax z10.h, p4/M, z10.h, z28.h\n" + "fmax z11.h, p4/M, z11.h, z28.h\n" + "fmax z12.h, p4/M, z12.h, z28.h\n" + "fmax z13.h, p4/M, z13.h, z28.h\n" + "fmax z14.h, p4/M, z14.h, z28.h\n" + "fmax z15.h, p4/M, z15.h, z28.h\n" + "fmax z16.h, p4/M, z16.h, z28.h\n" + "fmax z17.h, p4/M, z17.h, z28.h\n" + "fmax z18.h, p4/M, z18.h, z28.h\n" + "fmax z19.h, p4/M, z19.h, z28.h\n" + "fmax z20.h, p4/M, z20.h, z28.h\n" + "fmax z21.h, p4/M, z21.h, z28.h\n" + "fmax z22.h, p4/M, z22.h, z28.h\n" + "fmax z23.h, p4/M, z23.h, z28.h\n" + "fmax z24.h, p4/M, z24.h, z28.h\n" + "fmax z25.h, p4/M, z25.h, z28.h\n" + "fmax z26.h, p4/M, z26.h, z28.h\n" + "fmax z27.h, p4/M, z27.h, z28.h\n" "64:" // Height 5: No activation "st1h { z8.h }, p3, [x13]\n" "st1h { z9.h }, p2, [x13, #1, MUL VL]\n" @@ -1240,35 +1240,35 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "69:" // Height 6: no bias "tbz %x[flags], #0, 70f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p3/Z, [x13]\n" + "add x24, x13, x20, LSL #1\n" "add x23, x24, x20, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x13]\n" "add x22, x23, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x13, #2, MUL VL]\n" - "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z11.h }, p0/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p3/Z, [x22]\n" - "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n" - "ld1h { z28.h }, p3/Z, [x21]\n" - "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n" - "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n" - "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x22]\n" + "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x21]\n" + "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z28.h }, p3/Z, [x20]\n" + "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p0/Z, 
[x20, #3, MUL VL]\n" "b 71f\n" "70:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1300,16 +1300,16 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "72:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 74f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1321,11 +1321,11 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( "b 74f\n" "73:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "74:" // Height 6: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -1527,4 +1527,4 @@ void sve_ffhybrid_fp16_mla_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp index 0b543b667f..842db1a4fc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp @@ -163,11 +163,11 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -180,72 +180,72 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "10:" // Height 1: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, 
[x11, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x12]\n" + "fmla z8.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10]\n" + "fmla z10.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z11.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x12, #5, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, 
[x11, #6, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[7]\n" + "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n" "sub x27, x27, #0x8\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n" "cmp x27, #0x8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" "add x26, x26, #0x10\n" "addvl x12, x12, #8\n" "addvl x11, x11, #8\n" @@ -255,112 +255,112 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "11:" // Height 1: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x12]\n" + "fmla z8.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z10.h, z17.h, z0.h[0]\n" + "fmla z11.h, z16.h, z0.h[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[1]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z10.h, z17.h, z0.h[1]\n" + "fmla z11.h, z16.h, z0.h[1]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[2]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z10.h, z17.h, z0.h[2]\n" + "fmla z11.h, z16.h, z0.h[2]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[3]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z10.h, z17.h, z0.h[3]\n" + "fmla z11.h, z16.h, z0.h[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { 
z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[4]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z10.h, z17.h, z0.h[4]\n" + "fmla z11.h, z16.h, z0.h[4]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[5]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z10.h, z17.h, z0.h[5]\n" + "fmla z11.h, z16.h, z0.h[5]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[6]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z10.h, z17.h, z0.h[6]\n" + "fmla z11.h, z16.h, z0.h[6]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[7]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" @@ -372,17 +372,17 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "bne 7b\n" "tbz %x[flags], #1, 13f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z17.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" + "ld1rh { z16.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z17.h\n" + "fmin z9.h, p5/M, z9.h, z17.h\n" + "fmin z10.h, p5/M, z10.h, z17.h\n" + "fmin z11.h, p5/M, z11.h, z17.h\n" + "fmax z8.h, p5/M, z8.h, z16.h\n" + "fmax z9.h, p5/M, z9.h, z16.h\n" + "fmax z10.h, p5/M, z10.h, z16.h\n" + "fmax z11.h, p5/M, z11.h, z16.h\n" "13:" // Height 1: No activation "st1h { z8.h }, p4, [x13]\n" "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" @@ -443,15 +443,15 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "18:" // Height 2: 
no bias "tbz %x[flags], #0, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" + "add x20, x13, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x13]\n" "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x20]\n" + "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n" "b 20f\n" "19:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -467,12 +467,12 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "21:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 22f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -480,263 +480,263 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "b 23f\n" "22:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "23:" // Height 2: input setup done "cmp x27, #0x8\n" "ble 25f\n" "24:" // Height 2: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z1.h }, p0/Z, [x26]\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z1.h[0]\n" + "fmla z12.h, z17.h, z0.h[0]\n" + "fmla z9.h, z16.h, z1.h[0]\n" + "fmla z13.h, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z10.h, z17.h, z1.h[0]\n" + "fmla z14.h, z17.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x12, #1, MUL VL]\n" "cmp x27, #0x8\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[0]\n" + "fmla z15.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x11, #1, MUL VL]\n" "add x26, x26, #0x10\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[1]\n" + "fmla z12.h, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #1, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - 
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[1]\n" + "fmla z13.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[1]\n" + "fmla z14.h, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[1]\n" + "fmla z15.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[2]\n" + "fmla z12.h, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[2]\n" + "fmla z13.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[2]\n" + "fmla z14.h, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[2]\n" + "fmla z15.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[3]\n" + "fmla z12.h, z17.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[3]\n" + "fmla z13.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[3]\n" + "fmla z14.h, z17.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[3]\n" + "fmla z15.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[4]\n" + "fmla z12.h, z17.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[4]\n" + "fmla z13.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[4]\n" + "fmla z14.h, z17.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x12, #5, MUL VL]\n" + 
"fmla z11.h, z16.h, z1.h[4]\n" + "fmla z15.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[5]\n" + "fmla z12.h, z17.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[5]\n" + "fmla z13.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[5]\n" + "fmla z14.h, z17.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[5]\n" + "fmla z15.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x11, #6, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[6]\n" + "fmla z12.h, z17.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[6]\n" + "fmla z13.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[6]\n" + "fmla z14.h, z17.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x12, #7, MUL VL]\n" "addvl x12, x12, #8\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[6]\n" + "fmla z15.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x11, #7, MUL VL]\n" "addvl x11, x11, #8\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[7]\n" + "fmla z12.h, z17.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[7]\n" + "fmla z13.h, z16.h, z0.h[7]\n" + "ld1h { z16.h }, p5/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z10.h, z17.h, z1.h[7]\n" + "fmla z14.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z1.h[7]\n" + "fmla z15.h, z16.h, z0.h[7]\n" "bgt 24b\n" "25:" // Height 2: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[0]\n" + "fmla z12.h, z17.h, z1.h[0]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "fmla z13.h, z16.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z10.h, z17.h, z0.h[0]\n" + "fmla z14.h, z17.h, z1.h[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z11.h, z16.h, z0.h[0]\n" + "fmla z15.h, z16.h, z1.h[0]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[1]\n" + "fmla z12.h, z17.h, z1.h[1]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "fmla z13.h, z16.h, z1.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla 
z14.h, z6.h, z1.h[1]\n" + "fmla z10.h, z17.h, z0.h[1]\n" + "fmla z14.h, z17.h, z1.h[1]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z11.h, z16.h, z0.h[1]\n" + "fmla z15.h, z16.h, z1.h[1]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[2]\n" + "fmla z12.h, z17.h, z1.h[2]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "fmla z13.h, z16.h, z1.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z10.h, z17.h, z0.h[2]\n" + "fmla z14.h, z17.h, z1.h[2]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z11.h, z16.h, z0.h[2]\n" + "fmla z15.h, z16.h, z1.h[2]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[3]\n" + "fmla z12.h, z17.h, z1.h[3]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "fmla z13.h, z16.h, z1.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z10.h, z17.h, z0.h[3]\n" + "fmla z14.h, z17.h, z1.h[3]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z11.h, z16.h, z0.h[3]\n" + "fmla z15.h, z16.h, z1.h[3]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[4]\n" + "fmla z12.h, z17.h, z1.h[4]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "fmla z13.h, z16.h, z1.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z10.h, z17.h, z0.h[4]\n" + "fmla z14.h, z17.h, z1.h[4]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z11.h, z16.h, z0.h[4]\n" + "fmla z15.h, z16.h, z1.h[4]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[5]\n" + "fmla z12.h, z17.h, z1.h[5]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "fmla z13.h, z16.h, z1.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, 
p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z10.h, z17.h, z0.h[5]\n" + "fmla z14.h, z17.h, z1.h[5]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z11.h, z16.h, z0.h[5]\n" + "fmla z15.h, z16.h, z1.h[5]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[6]\n" + "fmla z12.h, z17.h, z1.h[6]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "fmla z13.h, z16.h, z1.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z10.h, z17.h, z0.h[6]\n" + "fmla z14.h, z17.h, z1.h[6]\n" "addvl x12, x12, #1\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z11.h, z16.h, z0.h[6]\n" + "fmla z15.h, z16.h, z1.h[6]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" + "ld1h { z17.h }, p5/Z, [x12]\n" + "ld1h { z16.h }, p5/Z, [x11]\n" + "fmla z8.h, z17.h, z0.h[7]\n" + "fmla z12.h, z17.h, z1.h[7]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "fmla z13.h, z16.h, z1.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x9]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z14.h, z17.h, z1.h[7]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" + "fmla z15.h, z16.h, z1.h[7]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "26:" // Height 2: Multiply loop: multiply skip @@ -748,25 +748,25 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "add x25, x13, x20, LSL #1\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z17.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" + "ld1rh { z16.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z17.h\n" + "fmin z9.h, p5/M, z9.h, z17.h\n" + "fmin z10.h, p5/M, z10.h, z17.h\n" + "fmin z11.h, p5/M, z11.h, z17.h\n" + "fmin z12.h, p5/M, z12.h, z17.h\n" + "fmin z13.h, p5/M, z13.h, z17.h\n" + "fmin z14.h, p5/M, z14.h, z17.h\n" + "fmin z15.h, p5/M, z15.h, z17.h\n" + "fmax z8.h, p5/M, z8.h, z16.h\n" + "fmax z9.h, p5/M, z9.h, z16.h\n" + "fmax z10.h, p5/M, z10.h, z16.h\n" + "fmax z11.h, p5/M, 
z11.h, z16.h\n" + "fmax z12.h, p5/M, z12.h, z16.h\n" + "fmax z13.h, p5/M, z13.h, z16.h\n" + "fmax z14.h, p5/M, z14.h, z16.h\n" + "fmax z15.h, p5/M, z15.h, z16.h\n" "27:" // Height 2: No activation "st1h { z8.h }, p4, [x13]\n" "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" @@ -835,20 +835,20 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "32:" // Height 3: no bias "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x21, x13, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x13]\n" "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x21]\n" + "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x20]\n" + "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n" "b 34f\n" "33:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -868,13 +868,13 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "35:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -883,153 +883,153 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "b 37f\n" "36:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "37:" // Height 3: input setup done "cmp x27, #0x8\n" "ble 39f\n" "38:" // Height 3: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1rqh { z0.h }, p0/Z, [x24]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "fmla z8.h, z21.h, z2.h[0]\n" + "fmla z12.h, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z16.h, z21.h, z0.h[0]\n" + "fmla z9.h, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "fmla z13.h, z20.h, z1.h[0]\n" + "fmla z17.h, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "cmp x27, #0x8\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" + 
"fmla z10.h, z21.h, z2.h[0]\n" + "fmla z14.h, z21.h, z1.h[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z18.h, z21.h, z0.h[0]\n" + "fmla z11.h, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x12, #1, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - 
"fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[0]\n" + "fmla z19.h, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[1]\n" + "fmla z12.h, z21.h, z1.h[1]\n" + "fmla z16.h, z21.h, z0.h[1]\n" + "fmla z9.h, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[1]\n" + "fmla z17.h, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[1]\n" + "fmla z14.h, z21.h, z1.h[1]\n" + "fmla z18.h, z21.h, z0.h[1]\n" + "fmla z11.h, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[1]\n" + "fmla z19.h, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[2]\n" + "fmla z12.h, z21.h, z1.h[2]\n" + "fmla z16.h, z21.h, z0.h[2]\n" + "fmla z9.h, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[2]\n" + "fmla z17.h, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[2]\n" + "fmla z14.h, z21.h, z1.h[2]\n" + "fmla z18.h, z21.h, z0.h[2]\n" + "fmla z11.h, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[2]\n" + "fmla z19.h, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[3]\n" + "fmla z12.h, z21.h, z1.h[3]\n" + "fmla z16.h, z21.h, z0.h[3]\n" + "fmla z9.h, z20.h, z2.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[3]\n" + "fmla z17.h, z20.h, z0.h[3]\n" + "ld1h { z20.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[3]\n" + "fmla z14.h, z21.h, z1.h[3]\n" + "fmla z18.h, z21.h, z0.h[3]\n" + "fmla z11.h, z20.h, z2.h[3]\n" + "ld1h { z21.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[3]\n" + "fmla z19.h, z20.h, z0.h[3]\n" + "ld1h { z20.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[4]\n" + "fmla z12.h, z21.h, z1.h[4]\n" + "fmla z16.h, z21.h, z0.h[4]\n" + "fmla z9.h, z20.h, z2.h[4]\n" + "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[4]\n" + "fmla z17.h, z20.h, z0.h[4]\n" + "ld1h { z20.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[4]\n" + "fmla z14.h, z21.h, z1.h[4]\n" + "fmla z18.h, z21.h, z0.h[4]\n" + "fmla z11.h, z20.h, z2.h[4]\n" + "ld1h { z21.h }, p5/Z, [x12, #5, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[4]\n" + "fmla z19.h, z20.h, z0.h[4]\n" + "ld1h { z20.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[5]\n" + "fmla z12.h, z21.h, z1.h[5]\n" + "fmla z16.h, z21.h, z0.h[5]\n" + "fmla z9.h, z20.h, z2.h[5]\n" + "ld1h { z21.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[5]\n" + "fmla z17.h, z20.h, z0.h[5]\n" + "ld1h { z20.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[5]\n" + "fmla z14.h, z21.h, z1.h[5]\n" + "fmla z18.h, z21.h, z0.h[5]\n" + "fmla z11.h, z20.h, z2.h[5]\n" + "ld1h { z21.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[5]\n" + "fmla z19.h, z20.h, z0.h[5]\n" + "ld1h { z20.h }, p5/Z, [x11, #6, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[6]\n" + "fmla z12.h, z21.h, z1.h[6]\n" + "fmla z16.h, z21.h, z0.h[6]\n" + "fmla z9.h, z20.h, z2.h[6]\n" + "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[6]\n" + "fmla z17.h, z20.h, z0.h[6]\n" + "ld1h { z20.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[6]\n" + "fmla z14.h, z21.h, z1.h[6]\n" + "fmla z18.h, z21.h, z0.h[6]\n" 
+ "fmla z11.h, z20.h, z2.h[6]\n" + "ld1h { z21.h }, p5/Z, [x12, #7, MUL VL]\n" "addvl x12, x12, #8\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[6]\n" + "fmla z19.h, z20.h, z0.h[6]\n" + "ld1h { z20.h }, p5/Z, [x11, #7, MUL VL]\n" "addvl x11, x11, #8\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[7]\n" + "fmla z12.h, z21.h, z1.h[7]\n" + "fmla z16.h, z21.h, z0.h[7]\n" + "fmla z9.h, z20.h, z2.h[7]\n" + "ld1h { z21.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[7]\n" + "fmla z17.h, z20.h, z0.h[7]\n" + "ld1h { z20.h }, p5/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z10.h, z21.h, z2.h[7]\n" + "fmla z14.h, z21.h, z1.h[7]\n" + "fmla z18.h, z21.h, z0.h[7]\n" + "fmla z11.h, z20.h, z2.h[7]\n" + "fmla z15.h, z20.h, z1.h[7]\n" + "fmla z19.h, z20.h, z0.h[7]\n" "bgt 38b\n" "39:" // Height 3: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1037,179 +1037,179 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "fmla z8.h, z21.h, z0.h[0]\n" + "fmla z12.h, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z16.h, z21.h, z2.h[0]\n" + "fmla z9.h, z20.h, z0.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "fmla z13.h, z20.h, z1.h[0]\n" + "fmla z17.h, z20.h, z2.h[0]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z10.h, z21.h, z0.h[0]\n" + "fmla z14.h, z21.h, z1.h[0]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z18.h, z21.h, z2.h[0]\n" + "fmla z11.h, z20.h, z0.h[0]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z15.h, z20.h, z1.h[0]\n" + "fmla z19.h, z20.h, z2.h[0]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[1]\n" + "fmla z12.h, z21.h, z1.h[1]\n" + "fmla z16.h, z21.h, z2.h[1]\n" + "fmla z9.h, z20.h, z0.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[1]\n" + "fmla z17.h, z20.h, z2.h[1]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z10.h, z21.h, z0.h[1]\n" + "fmla z14.h, z21.h, 
z1.h[1]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z18.h, z21.h, z2.h[1]\n" + "fmla z11.h, z20.h, z0.h[1]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z15.h, z20.h, z1.h[1]\n" + "fmla z19.h, z20.h, z2.h[1]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[2]\n" + "fmla z12.h, z21.h, z1.h[2]\n" + "fmla z16.h, z21.h, z2.h[2]\n" + "fmla z9.h, z20.h, z0.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[2]\n" + "fmla z17.h, z20.h, z2.h[2]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z10.h, z21.h, z0.h[2]\n" + "fmla z14.h, z21.h, z1.h[2]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z18.h, z21.h, z2.h[2]\n" + "fmla z11.h, z20.h, z0.h[2]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z15.h, z20.h, z1.h[2]\n" + "fmla z19.h, z20.h, z2.h[2]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[3]\n" + "fmla z12.h, z21.h, z1.h[3]\n" + "fmla z16.h, z21.h, z2.h[3]\n" + "fmla z9.h, z20.h, z0.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[3]\n" + "fmla z17.h, z20.h, z2.h[3]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z10.h, z21.h, z0.h[3]\n" + "fmla z14.h, z21.h, z1.h[3]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z18.h, z21.h, z2.h[3]\n" + "fmla z11.h, z20.h, z0.h[3]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z15.h, z20.h, z1.h[3]\n" + "fmla z19.h, z20.h, z2.h[3]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[4]\n" + "fmla z12.h, z21.h, z1.h[4]\n" + "fmla z16.h, z21.h, z2.h[4]\n" + "fmla z9.h, z20.h, z0.h[4]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[4]\n" + "fmla z17.h, z20.h, z2.h[4]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z10.h, z21.h, z0.h[4]\n" + "fmla z14.h, z21.h, z1.h[4]\n" "addvl x11, x11, #1\n" "addvl x10, x10, 
#1\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z18.h, z21.h, z2.h[4]\n" + "fmla z11.h, z20.h, z0.h[4]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z15.h, z20.h, z1.h[4]\n" + "fmla z19.h, z20.h, z2.h[4]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[5]\n" + "fmla z12.h, z21.h, z1.h[5]\n" + "fmla z16.h, z21.h, z2.h[5]\n" + "fmla z9.h, z20.h, z0.h[5]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[5]\n" + "fmla z17.h, z20.h, z2.h[5]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z10.h, z21.h, z0.h[5]\n" + "fmla z14.h, z21.h, z1.h[5]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z18.h, z21.h, z2.h[5]\n" + "fmla z11.h, z20.h, z0.h[5]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z15.h, z20.h, z1.h[5]\n" + "fmla z19.h, z20.h, z2.h[5]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[6]\n" + "fmla z12.h, z21.h, z1.h[6]\n" + "fmla z16.h, z21.h, z2.h[6]\n" + "fmla z9.h, z20.h, z0.h[6]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[6]\n" + "fmla z17.h, z20.h, z2.h[6]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z10.h, z21.h, z0.h[6]\n" + "fmla z14.h, z21.h, z1.h[6]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z18.h, z21.h, z2.h[6]\n" + "fmla z11.h, z20.h, z0.h[6]\n" "addvl x9, x9, #1\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z15.h, z20.h, z1.h[6]\n" + "fmla z19.h, z20.h, z2.h[6]\n" "ble 40f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z21.h }, p5/Z, [x12]\n" + "ld1h { z20.h }, p5/Z, [x11]\n" + "fmla z8.h, z21.h, z0.h[7]\n" + "fmla z12.h, z21.h, z1.h[7]\n" + "fmla z16.h, z21.h, z2.h[7]\n" + "fmla z9.h, z20.h, z0.h[7]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z20.h, z1.h[7]\n" + "fmla z17.h, z20.h, z2.h[7]\n" + "ld1h { z20.h }, p5/Z, [x9]\n" "addvl x11, x11, #1\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z10.h, z21.h, z0.h[7]\n" + "fmla z14.h, z21.h, z1.h[7]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z11.h, 
z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z18.h, z21.h, z2.h[7]\n" + "fmla z11.h, z20.h, z0.h[7]\n" + "fmla z15.h, z20.h, z1.h[7]\n" + "fmla z19.h, z20.h, z2.h[7]\n" "40:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1220,33 +1220,33 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "add x24, x25, x20, LSL #1\n" "tbz %x[flags], #1, 41f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z21.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" + "ld1rh { z20.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z21.h\n" + "fmin z9.h, p5/M, z9.h, z21.h\n" + "fmin z10.h, p5/M, z10.h, z21.h\n" + "fmin z11.h, p5/M, z11.h, z21.h\n" + "fmin z12.h, p5/M, z12.h, z21.h\n" + "fmin z13.h, p5/M, z13.h, z21.h\n" + "fmin z14.h, p5/M, z14.h, z21.h\n" + "fmin z15.h, p5/M, z15.h, z21.h\n" + "fmin z16.h, p5/M, z16.h, z21.h\n" + "fmin z17.h, p5/M, z17.h, z21.h\n" + "fmin z18.h, p5/M, z18.h, z21.h\n" + "fmin z19.h, p5/M, z19.h, z21.h\n" + "fmax z8.h, p5/M, z8.h, z20.h\n" + "fmax z9.h, p5/M, z9.h, z20.h\n" + "fmax z10.h, p5/M, z10.h, z20.h\n" + "fmax z11.h, p5/M, z11.h, z20.h\n" + "fmax z12.h, p5/M, z12.h, z20.h\n" + "fmax z13.h, p5/M, z13.h, z20.h\n" + "fmax z14.h, p5/M, z14.h, z20.h\n" + "fmax z15.h, p5/M, z15.h, z20.h\n" + "fmax z16.h, p5/M, z16.h, z20.h\n" + "fmax z17.h, p5/M, z17.h, z20.h\n" + "fmax z18.h, p5/M, z18.h, z20.h\n" + "fmax z19.h, p5/M, z19.h, z20.h\n" "41:" // Height 3: No activation "st1h { z8.h }, p4, [x13]\n" "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" @@ -1323,25 +1323,25 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "46:" // Height 4: no bias "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x22, x13, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" + 
"ld1h { z12.h }, p4/Z, [x22]\n" + "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x21]\n" + "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x20]\n" + "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n" "b 48f\n" "47:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -1365,14 +1365,14 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "49:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1382,188 +1382,188 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "b 51f\n" "50:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "51:" // Height 4: input setup done "cmp x27, #0x8\n" "ble 53f\n" "52:" // Height 4: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z3.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "ld1rqh { z0.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z3.h[0]\n" + "fmla z12.h, z25.h, z2.h[0]\n" + "fmla z16.h, z25.h, z1.h[0]\n" + "fmla z20.h, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "add x25, x25, #0x10\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z9.h, z24.h, z3.h[0]\n" + "fmla z13.h, z24.h, z2.h[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, 
z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - "fmla z10.h, z6.h, 
z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z17.h, z24.h, z1.h[0]\n" + "fmla z21.h, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" + "fmla z10.h, z25.h, z3.h[0]\n" + "fmla z14.h, z25.h, z2.h[0]\n" + "fmla z18.h, z25.h, z1.h[0]\n" + "fmla z22.h, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[0]\n" + "fmla z15.h, z24.h, z2.h[0]\n" + "fmla z19.h, z24.h, z1.h[0]\n" + "fmla z23.h, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[1]\n" + "fmla z12.h, z25.h, z2.h[1]\n" + "fmla z16.h, z25.h, z1.h[1]\n" + "fmla z20.h, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[1]\n" + "fmla z13.h, z24.h, z2.h[1]\n" + "fmla z17.h, z24.h, z1.h[1]\n" + "fmla z21.h, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[1]\n" + "fmla z14.h, z25.h, z2.h[1]\n" + "fmla z18.h, z25.h, z1.h[1]\n" + "fmla z22.h, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[1]\n" + "fmla z15.h, z24.h, z2.h[1]\n" + "fmla z19.h, z24.h, z1.h[1]\n" + "fmla z23.h, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[2]\n" + "fmla z12.h, z25.h, z2.h[2]\n" + "fmla z16.h, z25.h, z1.h[2]\n" + "fmla z20.h, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[2]\n" + "fmla z13.h, z24.h, z2.h[2]\n" + "fmla z17.h, z24.h, z1.h[2]\n" + "fmla z21.h, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[2]\n" + "fmla z14.h, z25.h, z2.h[2]\n" + "fmla z18.h, z25.h, z1.h[2]\n" + "fmla z22.h, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[2]\n" + "fmla z15.h, z24.h, z2.h[2]\n" + "fmla z19.h, z24.h, z1.h[2]\n" + "fmla z23.h, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[3]\n" + "fmla z12.h, z25.h, z2.h[3]\n" + "fmla z16.h, z25.h, z1.h[3]\n" + "fmla z20.h, z25.h, z0.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[3]\n" + "fmla z13.h, z24.h, z2.h[3]\n" + "fmla z17.h, z24.h, z1.h[3]\n" + "fmla z21.h, z24.h, z0.h[3]\n" + "ld1h { z24.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[3]\n" + "fmla z14.h, z25.h, z2.h[3]\n" + "fmla z18.h, z25.h, z1.h[3]\n" + "fmla z22.h, z25.h, z0.h[3]\n" + "ld1h { z25.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[3]\n" + "fmla z15.h, z24.h, z2.h[3]\n" + "fmla z19.h, z24.h, z1.h[3]\n" + "fmla z23.h, z24.h, z0.h[3]\n" + "ld1h { z24.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[4]\n" + "fmla z12.h, z25.h, z2.h[4]\n" + "fmla z16.h, z25.h, z1.h[4]\n" + "fmla z20.h, z25.h, z0.h[4]\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[4]\n" + "fmla z13.h, z24.h, z2.h[4]\n" + "fmla z17.h, z24.h, z1.h[4]\n" + "fmla z21.h, z24.h, z0.h[4]\n" + "ld1h { z24.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[4]\n" + "fmla z14.h, z25.h, z2.h[4]\n" + "fmla z18.h, z25.h, z1.h[4]\n" + "fmla z22.h, z25.h, z0.h[4]\n" + "ld1h { z25.h }, p5/Z, [x12, #5, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[4]\n" + "fmla z15.h, z24.h, z2.h[4]\n" + "fmla z19.h, z24.h, z1.h[4]\n" + "fmla z23.h, z24.h, z0.h[4]\n" + "ld1h { z24.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[5]\n" + "fmla z12.h, z25.h, z2.h[5]\n" + "fmla z16.h, z25.h, z1.h[5]\n" + "fmla 
z20.h, z25.h, z0.h[5]\n" + "ld1h { z25.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[5]\n" + "fmla z13.h, z24.h, z2.h[5]\n" + "fmla z17.h, z24.h, z1.h[5]\n" + "fmla z21.h, z24.h, z0.h[5]\n" + "ld1h { z24.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[5]\n" + "fmla z14.h, z25.h, z2.h[5]\n" + "fmla z18.h, z25.h, z1.h[5]\n" + "fmla z22.h, z25.h, z0.h[5]\n" + "ld1h { z25.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[5]\n" + "fmla z15.h, z24.h, z2.h[5]\n" + "fmla z19.h, z24.h, z1.h[5]\n" + "fmla z23.h, z24.h, z0.h[5]\n" + "ld1h { z24.h }, p5/Z, [x11, #6, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[6]\n" + "fmla z12.h, z25.h, z2.h[6]\n" + "fmla z16.h, z25.h, z1.h[6]\n" + "fmla z20.h, z25.h, z0.h[6]\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[6]\n" + "fmla z13.h, z24.h, z2.h[6]\n" + "fmla z17.h, z24.h, z1.h[6]\n" + "fmla z21.h, z24.h, z0.h[6]\n" + "ld1h { z24.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[6]\n" + "fmla z14.h, z25.h, z2.h[6]\n" + "fmla z18.h, z25.h, z1.h[6]\n" + "fmla z22.h, z25.h, z0.h[6]\n" + "ld1h { z25.h }, p5/Z, [x12, #7, MUL VL]\n" "addvl x12, x12, #8\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[6]\n" + "fmla z15.h, z24.h, z2.h[6]\n" + "fmla z19.h, z24.h, z1.h[6]\n" + "fmla z23.h, z24.h, z0.h[6]\n" + "ld1h { z24.h }, p5/Z, [x11, #7, MUL VL]\n" "addvl x11, x11, #8\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[7]\n" + "fmla z12.h, z25.h, z2.h[7]\n" + "fmla z16.h, z25.h, z1.h[7]\n" + "fmla z20.h, z25.h, z0.h[7]\n" + "ld1h { z25.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[7]\n" + "fmla z13.h, z24.h, z2.h[7]\n" + "fmla z17.h, z24.h, z1.h[7]\n" + "fmla z21.h, z24.h, z0.h[7]\n" + "ld1h { z24.h }, p5/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z10.h, z25.h, z3.h[7]\n" + "fmla z14.h, z25.h, z2.h[7]\n" + "fmla z18.h, z25.h, z1.h[7]\n" + "fmla z22.h, z25.h, z0.h[7]\n" + "fmla z11.h, z24.h, z3.h[7]\n" + "fmla z15.h, z24.h, z2.h[7]\n" + "fmla z19.h, z24.h, z1.h[7]\n" + "fmla z23.h, z24.h, z0.h[7]\n" "bgt 52b\n" "53:" // Height 4: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1572,211 +1572,211 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "subs x27, x27, #0x1\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[0]\n" + "fmla z12.h, z25.h, z1.h[0]\n" + "fmla z16.h, z25.h, z2.h[0]\n" + "fmla z20.h, z25.h, z3.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z9.h, z7.h, z0.h[0]\n" - 
"fmla z13.h, z7.h, z1.h[0]\n" + "fmla z9.h, z24.h, z0.h[0]\n" + "fmla z13.h, z24.h, z1.h[0]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[0]\n" + "fmla z21.h, z24.h, z3.h[0]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z10.h, z25.h, z0.h[0]\n" + "fmla z14.h, z25.h, z1.h[0]\n" + "fmla z18.h, z25.h, z2.h[0]\n" + "fmla z22.h, z25.h, z3.h[0]\n" + "fmla z11.h, z24.h, z0.h[0]\n" + "fmla z15.h, z24.h, z1.h[0]\n" + "fmla z19.h, z24.h, z2.h[0]\n" + "fmla z23.h, z24.h, z3.h[0]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[1]\n" + "fmla z12.h, z25.h, z1.h[1]\n" + "fmla z16.h, z25.h, z2.h[1]\n" + "fmla z20.h, z25.h, z3.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z9.h, z24.h, z0.h[1]\n" + "fmla z13.h, z24.h, z1.h[1]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[1]\n" + "fmla z21.h, z24.h, z3.h[1]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z10.h, z25.h, z0.h[1]\n" + "fmla z14.h, z25.h, z1.h[1]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z18.h, z25.h, z2.h[1]\n" + "fmla z22.h, z25.h, z3.h[1]\n" + "fmla z11.h, z24.h, z0.h[1]\n" + "fmla z15.h, z24.h, z1.h[1]\n" + "fmla z19.h, z24.h, z2.h[1]\n" + "fmla z23.h, z24.h, z3.h[1]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[2]\n" + "fmla z12.h, z25.h, z1.h[2]\n" + "fmla z16.h, z25.h, z2.h[2]\n" + "fmla z20.h, z25.h, z3.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z9.h, z24.h, z0.h[2]\n" + "fmla z13.h, z24.h, z1.h[2]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[2]\n" + "fmla z21.h, z24.h, z3.h[2]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z10.h, z25.h, z0.h[2]\n" + "fmla z14.h, z25.h, z1.h[2]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z18.h, 
z25.h, z2.h[2]\n" + "fmla z22.h, z25.h, z3.h[2]\n" + "fmla z11.h, z24.h, z0.h[2]\n" + "fmla z15.h, z24.h, z1.h[2]\n" + "fmla z19.h, z24.h, z2.h[2]\n" + "fmla z23.h, z24.h, z3.h[2]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[3]\n" + "fmla z12.h, z25.h, z1.h[3]\n" + "fmla z16.h, z25.h, z2.h[3]\n" + "fmla z20.h, z25.h, z3.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z9.h, z24.h, z0.h[3]\n" + "fmla z13.h, z24.h, z1.h[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[3]\n" + "fmla z21.h, z24.h, z3.h[3]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z10.h, z25.h, z0.h[3]\n" + "fmla z14.h, z25.h, z1.h[3]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z18.h, z25.h, z2.h[3]\n" + "fmla z22.h, z25.h, z3.h[3]\n" + "fmla z11.h, z24.h, z0.h[3]\n" + "fmla z15.h, z24.h, z1.h[3]\n" + "fmla z19.h, z24.h, z2.h[3]\n" + "fmla z23.h, z24.h, z3.h[3]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[4]\n" + "fmla z12.h, z25.h, z1.h[4]\n" + "fmla z16.h, z25.h, z2.h[4]\n" + "fmla z20.h, z25.h, z3.h[4]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z9.h, z24.h, z0.h[4]\n" + "fmla z13.h, z24.h, z1.h[4]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[4]\n" + "fmla z21.h, z24.h, z3.h[4]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z10.h, z25.h, z0.h[4]\n" + "fmla z14.h, z25.h, z1.h[4]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z18.h, z25.h, z2.h[4]\n" + "fmla z22.h, z25.h, z3.h[4]\n" + "fmla z11.h, z24.h, z0.h[4]\n" + "fmla z15.h, z24.h, z1.h[4]\n" + "fmla z19.h, z24.h, z2.h[4]\n" + "fmla z23.h, z24.h, z3.h[4]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[5]\n" + "fmla z12.h, z25.h, z1.h[5]\n" + "fmla z16.h, z25.h, z2.h[5]\n" + "fmla z20.h, z25.h, z3.h[5]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, 
z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z9.h, z24.h, z0.h[5]\n" + "fmla z13.h, z24.h, z1.h[5]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[5]\n" + "fmla z21.h, z24.h, z3.h[5]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z10.h, z25.h, z0.h[5]\n" + "fmla z14.h, z25.h, z1.h[5]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z18.h, z25.h, z2.h[5]\n" + "fmla z22.h, z25.h, z3.h[5]\n" + "fmla z11.h, z24.h, z0.h[5]\n" + "fmla z15.h, z24.h, z1.h[5]\n" + "fmla z19.h, z24.h, z2.h[5]\n" + "fmla z23.h, z24.h, z3.h[5]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[6]\n" + "fmla z12.h, z25.h, z1.h[6]\n" + "fmla z16.h, z25.h, z2.h[6]\n" + "fmla z20.h, z25.h, z3.h[6]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z9.h, z24.h, z0.h[6]\n" + "fmla z13.h, z24.h, z1.h[6]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[6]\n" + "fmla z21.h, z24.h, z3.h[6]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z10.h, z25.h, z0.h[6]\n" + "fmla z14.h, z25.h, z1.h[6]\n" "addvl x9, x9, #1\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z18.h, z25.h, z2.h[6]\n" + "fmla z22.h, z25.h, z3.h[6]\n" + "fmla z11.h, z24.h, z0.h[6]\n" + "fmla z15.h, z24.h, z1.h[6]\n" + "fmla z19.h, z24.h, z2.h[6]\n" + "fmla z23.h, z24.h, z3.h[6]\n" "ble 54f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "ld1h { z25.h }, p5/Z, [x12]\n" + "ld1h { z24.h }, p5/Z, [x11]\n" + "fmla z8.h, z25.h, z0.h[7]\n" + "fmla z12.h, z25.h, z1.h[7]\n" + "fmla z16.h, z25.h, z2.h[7]\n" + "fmla z20.h, z25.h, z3.h[7]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z9.h, z24.h, z0.h[7]\n" + "fmla z13.h, z24.h, z1.h[7]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z24.h, z2.h[7]\n" + "fmla z21.h, z24.h, z3.h[7]\n" + "ld1h { z24.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z10.h, z25.h, z0.h[7]\n" + "fmla z14.h, z25.h, z1.h[7]\n" + "fmla 
z18.h, z25.h, z2.h[7]\n" + "fmla z22.h, z25.h, z3.h[7]\n" + "fmla z11.h, z24.h, z0.h[7]\n" + "fmla z15.h, z24.h, z1.h[7]\n" + "fmla z19.h, z24.h, z2.h[7]\n" + "fmla z23.h, z24.h, z3.h[7]\n" "54:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1788,41 +1788,41 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "add x23, x24, x20, LSL #1\n" "tbz %x[flags], #1, 55f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z25.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmin z20.h, p5/M, z20.h, z1.h\n" - "fmin z21.h, p5/M, z21.h, z1.h\n" - "fmin z22.h, p5/M, z22.h, z1.h\n" - "fmin z23.h, p5/M, z23.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" - "fmax z20.h, p5/M, z20.h, z0.h\n" - "fmax z21.h, p5/M, z21.h, z0.h\n" - "fmax z22.h, p5/M, z22.h, z0.h\n" - "fmax z23.h, p5/M, z23.h, z0.h\n" + "ld1rh { z24.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z25.h\n" + "fmin z9.h, p5/M, z9.h, z25.h\n" + "fmin z10.h, p5/M, z10.h, z25.h\n" + "fmin z11.h, p5/M, z11.h, z25.h\n" + "fmin z12.h, p5/M, z12.h, z25.h\n" + "fmin z13.h, p5/M, z13.h, z25.h\n" + "fmin z14.h, p5/M, z14.h, z25.h\n" + "fmin z15.h, p5/M, z15.h, z25.h\n" + "fmin z16.h, p5/M, z16.h, z25.h\n" + "fmin z17.h, p5/M, z17.h, z25.h\n" + "fmin z18.h, p5/M, z18.h, z25.h\n" + "fmin z19.h, p5/M, z19.h, z25.h\n" + "fmin z20.h, p5/M, z20.h, z25.h\n" + "fmin z21.h, p5/M, z21.h, z25.h\n" + "fmin z22.h, p5/M, z22.h, z25.h\n" + "fmin z23.h, p5/M, z23.h, z25.h\n" + "fmax z8.h, p5/M, z8.h, z24.h\n" + "fmax z9.h, p5/M, z9.h, z24.h\n" + "fmax z10.h, p5/M, z10.h, z24.h\n" + "fmax z11.h, p5/M, z11.h, z24.h\n" + "fmax z12.h, p5/M, z12.h, z24.h\n" + "fmax z13.h, p5/M, z13.h, z24.h\n" + "fmax z14.h, p5/M, z14.h, z24.h\n" + "fmax z15.h, p5/M, z15.h, z24.h\n" + "fmax z16.h, p5/M, z16.h, z24.h\n" + "fmax z17.h, p5/M, z17.h, z24.h\n" + "fmax z18.h, p5/M, z18.h, z24.h\n" + "fmax z19.h, p5/M, z19.h, z24.h\n" + "fmax z20.h, p5/M, z20.h, z24.h\n" + "fmax z21.h, p5/M, z21.h, z24.h\n" + "fmax z22.h, p5/M, z22.h, z24.h\n" + "fmax z23.h, p5/M, z23.h, z24.h\n" "55:" // Height 4: No activation "st1h { z8.h }, p4, [x13]\n" "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" @@ -1907,30 +1907,30 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "60:" // Height 5: no bias "tbz %x[flags], #0, 61f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #1\n" + "add x23, x13, x20, LSL #1\n" "add x22, x23, x20, LSL #1\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, 
[x13, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p4/Z, [x22]\n" - "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x23]\n" + "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x22]\n" + "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x21]\n" + "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x20]\n" + "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n" "b 62f\n" "61:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1958,15 +1958,15 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "63:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 64f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 65f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1977,223 +1977,223 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "b 65f\n" "64:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "65:" // Height 5: input setup done "cmp x27, #0x8\n" "ble 67f\n" "66:" // Height 5: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z4.h }, p0/Z, [x26]\n" + "ld1rqh { z3.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1rqh { z0.h }, p0/Z, [x22]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "fmla 
z8.h, z29.h, z4.h[0]\n" + "fmla z12.h, z29.h, z3.h[0]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z16.h, z29.h, z2.h[0]\n" + "fmla z20.h, z29.h, z1.h[0]\n" "add x25, x25, #0x10\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z0.h[0]\n" + "fmla z9.h, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "add x24, x24, #0x10\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z13.h, z28.h, z3.h[0]\n" + "fmla z17.h, z28.h, z2.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z8.h, z6.h, 
z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z21.h, z28.h, z1.h[0]\n" + "fmla z25.h, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" + "fmla z10.h, z29.h, z4.h[0]\n" + "fmla z14.h, z29.h, z3.h[0]\n" + "fmla z18.h, z29.h, z2.h[0]\n" + "fmla z22.h, z29.h, z1.h[0]\n" + "fmla z26.h, z29.h, z0.h[0]\n" + "fmla z11.h, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[0]\n" + "fmla z19.h, z28.h, z2.h[0]\n" + "fmla z23.h, z28.h, z1.h[0]\n" + "fmla z27.h, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[1]\n" + "fmla z12.h, z29.h, z3.h[1]\n" + "fmla z16.h, z29.h, z2.h[1]\n" + "fmla z20.h, z29.h, z1.h[1]\n" + "fmla z24.h, z29.h, z0.h[1]\n" + "fmla z9.h, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[1]\n" + "fmla z17.h, z28.h, z2.h[1]\n" + "fmla z21.h, z28.h, z1.h[1]\n" + "fmla z25.h, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[1]\n" + "fmla z14.h, z29.h, z3.h[1]\n" + "fmla z18.h, z29.h, z2.h[1]\n" + "fmla z22.h, z29.h, z1.h[1]\n" + "fmla z26.h, z29.h, z0.h[1]\n" + "fmla z11.h, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[1]\n" + "fmla z19.h, z28.h, z2.h[1]\n" + "fmla z23.h, z28.h, z1.h[1]\n" + "fmla 
z27.h, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[2]\n" + "fmla z12.h, z29.h, z3.h[2]\n" + "fmla z16.h, z29.h, z2.h[2]\n" + "fmla z20.h, z29.h, z1.h[2]\n" + "fmla z24.h, z29.h, z0.h[2]\n" + "fmla z9.h, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[2]\n" + "fmla z17.h, z28.h, z2.h[2]\n" + "fmla z21.h, z28.h, z1.h[2]\n" + "fmla z25.h, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[2]\n" + "fmla z14.h, z29.h, z3.h[2]\n" + "fmla z18.h, z29.h, z2.h[2]\n" + "fmla z22.h, z29.h, z1.h[2]\n" + "fmla z26.h, z29.h, z0.h[2]\n" + "fmla z11.h, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[2]\n" + "fmla z19.h, z28.h, z2.h[2]\n" + "fmla z23.h, z28.h, z1.h[2]\n" + "fmla z27.h, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[3]\n" + "fmla z12.h, z29.h, z3.h[3]\n" + "fmla z16.h, z29.h, z2.h[3]\n" + "fmla z20.h, z29.h, z1.h[3]\n" + "fmla z24.h, z29.h, z0.h[3]\n" + "fmla z9.h, z28.h, z4.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[3]\n" + "fmla z17.h, z28.h, z2.h[3]\n" + "fmla z21.h, z28.h, z1.h[3]\n" + "fmla z25.h, z28.h, z0.h[3]\n" + "ld1h { z28.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[3]\n" + "fmla z14.h, z29.h, z3.h[3]\n" + "fmla z18.h, z29.h, z2.h[3]\n" + "fmla z22.h, z29.h, z1.h[3]\n" + "fmla z26.h, z29.h, z0.h[3]\n" + "fmla z11.h, z28.h, z4.h[3]\n" + "ld1h { z29.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[3]\n" + "fmla z19.h, z28.h, z2.h[3]\n" + "fmla z23.h, z28.h, z1.h[3]\n" + "fmla z27.h, z28.h, z0.h[3]\n" + "ld1h { z28.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[4]\n" + "fmla z12.h, z29.h, z3.h[4]\n" + "fmla z16.h, z29.h, z2.h[4]\n" + "fmla z20.h, z29.h, z1.h[4]\n" + "fmla z24.h, z29.h, z0.h[4]\n" + "fmla z9.h, z28.h, z4.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[4]\n" + "fmla z17.h, z28.h, z2.h[4]\n" + "fmla z21.h, z28.h, z1.h[4]\n" + "fmla z25.h, z28.h, z0.h[4]\n" + "ld1h { z28.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[4]\n" + "fmla z14.h, z29.h, z3.h[4]\n" + "fmla z18.h, z29.h, z2.h[4]\n" + "fmla z22.h, z29.h, z1.h[4]\n" + "fmla z26.h, z29.h, z0.h[4]\n" + "fmla z11.h, z28.h, z4.h[4]\n" + "ld1h { z29.h }, p5/Z, [x12, #5, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[4]\n" + "fmla z19.h, z28.h, z2.h[4]\n" + "fmla z23.h, z28.h, z1.h[4]\n" + "fmla z27.h, z28.h, z0.h[4]\n" + "ld1h { z28.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[5]\n" + "fmla z12.h, z29.h, z3.h[5]\n" + "fmla z16.h, z29.h, z2.h[5]\n" + "fmla z20.h, z29.h, z1.h[5]\n" + "fmla z24.h, z29.h, z0.h[5]\n" + "fmla z9.h, z28.h, z4.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[5]\n" + "fmla z17.h, z28.h, z2.h[5]\n" + "fmla z21.h, z28.h, z1.h[5]\n" + "fmla z25.h, z28.h, z0.h[5]\n" + "ld1h { z28.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[5]\n" + "fmla z14.h, z29.h, z3.h[5]\n" + "fmla z18.h, z29.h, z2.h[5]\n" + "fmla z22.h, z29.h, z1.h[5]\n" + "fmla z26.h, z29.h, z0.h[5]\n" + "fmla z11.h, z28.h, z4.h[5]\n" + "ld1h { z29.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[5]\n" + "fmla z19.h, z28.h, z2.h[5]\n" + "fmla z23.h, z28.h, z1.h[5]\n" + "fmla z27.h, z28.h, z0.h[5]\n" + "ld1h { z28.h }, p5/Z, [x11, #6, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[6]\n" + "fmla z12.h, z29.h, z3.h[6]\n" + "fmla z16.h, z29.h, z2.h[6]\n" + "fmla z20.h, z29.h, 
z1.h[6]\n" + "fmla z24.h, z29.h, z0.h[6]\n" + "fmla z9.h, z28.h, z4.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[6]\n" + "fmla z17.h, z28.h, z2.h[6]\n" + "fmla z21.h, z28.h, z1.h[6]\n" + "fmla z25.h, z28.h, z0.h[6]\n" + "ld1h { z28.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[6]\n" + "fmla z14.h, z29.h, z3.h[6]\n" + "fmla z18.h, z29.h, z2.h[6]\n" + "fmla z22.h, z29.h, z1.h[6]\n" + "fmla z26.h, z29.h, z0.h[6]\n" + "fmla z11.h, z28.h, z4.h[6]\n" + "ld1h { z29.h }, p5/Z, [x12, #7, MUL VL]\n" "addvl x12, x12, #8\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[6]\n" + "fmla z19.h, z28.h, z2.h[6]\n" + "fmla z23.h, z28.h, z1.h[6]\n" + "fmla z27.h, z28.h, z0.h[6]\n" + "ld1h { z28.h }, p5/Z, [x11, #7, MUL VL]\n" "addvl x11, x11, #8\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[7]\n" + "fmla z12.h, z29.h, z3.h[7]\n" + "fmla z16.h, z29.h, z2.h[7]\n" + "fmla z20.h, z29.h, z1.h[7]\n" + "fmla z24.h, z29.h, z0.h[7]\n" + "fmla z9.h, z28.h, z4.h[7]\n" + "ld1h { z29.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[7]\n" + "fmla z17.h, z28.h, z2.h[7]\n" + "fmla z21.h, z28.h, z1.h[7]\n" + "fmla z25.h, z28.h, z0.h[7]\n" + "ld1h { z28.h }, p5/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z10.h, z29.h, z4.h[7]\n" + "fmla z14.h, z29.h, z3.h[7]\n" + "fmla z18.h, z29.h, z2.h[7]\n" + "fmla z22.h, z29.h, z1.h[7]\n" + "fmla z26.h, z29.h, z0.h[7]\n" + "fmla z11.h, z28.h, z4.h[7]\n" + "fmla z15.h, z28.h, z3.h[7]\n" + "fmla z19.h, z28.h, z2.h[7]\n" + "fmla z23.h, z28.h, z1.h[7]\n" + "fmla z27.h, z28.h, z0.h[7]\n" "bgt 66b\n" "67:" // Height 5: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -2203,243 +2203,243 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "fmla z8.h, z29.h, z0.h[0]\n" + "fmla z12.h, z29.h, z1.h[0]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z16.h, z29.h, z2.h[0]\n" + "fmla z20.h, z29.h, z3.h[0]\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[0]\n" + "fmla z9.h, z28.h, z0.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z13.h, z28.h, z1.h[0]\n" + "fmla z17.h, z28.h, z2.h[0]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, 
z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[0]\n" + "fmla z25.h, z28.h, z4.h[0]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z10.h, z29.h, z0.h[0]\n" + "fmla z14.h, z29.h, z1.h[0]\n" + "fmla z18.h, z29.h, z2.h[0]\n" + "fmla z22.h, z29.h, z3.h[0]\n" + "fmla z26.h, z29.h, z4.h[0]\n" + "fmla z11.h, z28.h, z0.h[0]\n" + "fmla z15.h, z28.h, z1.h[0]\n" + "fmla z19.h, z28.h, z2.h[0]\n" + "fmla z23.h, z28.h, z3.h[0]\n" + "fmla z27.h, z28.h, z4.h[0]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[1]\n" + "fmla z12.h, z29.h, z1.h[1]\n" + "fmla z16.h, z29.h, z2.h[1]\n" + "fmla z20.h, z29.h, z3.h[1]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[1]\n" + "fmla z9.h, z28.h, z0.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z13.h, z28.h, z1.h[1]\n" + "fmla z17.h, z28.h, z2.h[1]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[1]\n" + "fmla z25.h, z28.h, z4.h[1]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z10.h, z29.h, z0.h[1]\n" + "fmla z14.h, z29.h, z1.h[1]\n" + "fmla z18.h, z29.h, z2.h[1]\n" + "fmla z22.h, z29.h, z3.h[1]\n" + "fmla z26.h, z29.h, z4.h[1]\n" + "fmla z11.h, z28.h, z0.h[1]\n" + "fmla z15.h, z28.h, z1.h[1]\n" + "fmla z19.h, z28.h, z2.h[1]\n" + "fmla z23.h, z28.h, z3.h[1]\n" + "fmla z27.h, z28.h, z4.h[1]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[2]\n" + "fmla z12.h, z29.h, z1.h[2]\n" + "fmla z16.h, z29.h, z2.h[2]\n" + "fmla z20.h, z29.h, z3.h[2]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[2]\n" + "fmla z9.h, z28.h, z0.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z13.h, z28.h, z1.h[2]\n" + "fmla z17.h, z28.h, z2.h[2]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[2]\n" + "fmla z25.h, z28.h, z4.h[2]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, 
z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" + "fmla z10.h, z29.h, z0.h[2]\n" + "fmla z14.h, z29.h, z1.h[2]\n" + "fmla z18.h, z29.h, z2.h[2]\n" + "fmla z22.h, z29.h, z3.h[2]\n" + "fmla z26.h, z29.h, z4.h[2]\n" + "fmla z11.h, z28.h, z0.h[2]\n" + "fmla z15.h, z28.h, z1.h[2]\n" + "fmla z19.h, z28.h, z2.h[2]\n" + "fmla z23.h, z28.h, z3.h[2]\n" + "fmla z27.h, z28.h, z4.h[2]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[3]\n" + "fmla z12.h, z29.h, z1.h[3]\n" + "fmla z16.h, z29.h, z2.h[3]\n" + "fmla z20.h, z29.h, z3.h[3]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[3]\n" + "fmla z9.h, z28.h, z0.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z13.h, z28.h, z1.h[3]\n" + "fmla z17.h, z28.h, z2.h[3]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[3]\n" + "fmla z25.h, z28.h, z4.h[3]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z10.h, z29.h, z0.h[3]\n" + "fmla z14.h, z29.h, z1.h[3]\n" + "fmla z18.h, z29.h, z2.h[3]\n" + "fmla z22.h, z29.h, z3.h[3]\n" + "fmla z26.h, z29.h, z4.h[3]\n" + "fmla z11.h, z28.h, z0.h[3]\n" + "fmla z15.h, z28.h, z1.h[3]\n" + "fmla z19.h, z28.h, z2.h[3]\n" + "fmla z23.h, z28.h, z3.h[3]\n" + "fmla z27.h, z28.h, z4.h[3]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[4]\n" + "fmla z12.h, z29.h, z1.h[4]\n" + "fmla z16.h, z29.h, z2.h[4]\n" + "fmla z20.h, z29.h, z3.h[4]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[4]\n" + "fmla z9.h, z28.h, z0.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z13.h, z28.h, z1.h[4]\n" + "fmla z17.h, z28.h, z2.h[4]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[4]\n" + "fmla z25.h, z28.h, z4.h[4]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - 
"fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z10.h, z29.h, z0.h[4]\n" + "fmla z14.h, z29.h, z1.h[4]\n" + "fmla z18.h, z29.h, z2.h[4]\n" + "fmla z22.h, z29.h, z3.h[4]\n" + "fmla z26.h, z29.h, z4.h[4]\n" + "fmla z11.h, z28.h, z0.h[4]\n" + "fmla z15.h, z28.h, z1.h[4]\n" + "fmla z19.h, z28.h, z2.h[4]\n" + "fmla z23.h, z28.h, z3.h[4]\n" + "fmla z27.h, z28.h, z4.h[4]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[5]\n" + "fmla z12.h, z29.h, z1.h[5]\n" + "fmla z16.h, z29.h, z2.h[5]\n" + "fmla z20.h, z29.h, z3.h[5]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[5]\n" + "fmla z9.h, z28.h, z0.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z13.h, z28.h, z1.h[5]\n" + "fmla z17.h, z28.h, z2.h[5]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[5]\n" + "fmla z25.h, z28.h, z4.h[5]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z10.h, z29.h, z0.h[5]\n" + "fmla z14.h, z29.h, z1.h[5]\n" + "fmla z18.h, z29.h, z2.h[5]\n" + "fmla z22.h, z29.h, z3.h[5]\n" + "fmla z26.h, z29.h, z4.h[5]\n" + "fmla z11.h, z28.h, z0.h[5]\n" + "fmla z15.h, z28.h, z1.h[5]\n" + "fmla z19.h, z28.h, z2.h[5]\n" + "fmla z23.h, z28.h, z3.h[5]\n" + "fmla z27.h, z28.h, z4.h[5]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[6]\n" + "fmla z12.h, z29.h, z1.h[6]\n" + "fmla z16.h, z29.h, z2.h[6]\n" + "fmla z20.h, z29.h, z3.h[6]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[6]\n" + "fmla z9.h, z28.h, z0.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z13.h, z28.h, z1.h[6]\n" + "fmla z17.h, z28.h, z2.h[6]\n" "addvl x10, x10, #1\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z21.h, z28.h, z3.h[6]\n" + "fmla z25.h, z28.h, z4.h[6]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z10.h, z29.h, z0.h[6]\n" + "fmla z14.h, 
z29.h, z1.h[6]\n" + "fmla z18.h, z29.h, z2.h[6]\n" + "fmla z22.h, z29.h, z3.h[6]\n" + "fmla z26.h, z29.h, z4.h[6]\n" + "fmla z11.h, z28.h, z0.h[6]\n" + "fmla z15.h, z28.h, z1.h[6]\n" + "fmla z19.h, z28.h, z2.h[6]\n" + "fmla z23.h, z28.h, z3.h[6]\n" + "fmla z27.h, z28.h, z4.h[6]\n" "ble 68f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z29.h }, p5/Z, [x12]\n" + "ld1h { z28.h }, p5/Z, [x11]\n" + "fmla z8.h, z29.h, z0.h[7]\n" + "fmla z12.h, z29.h, z1.h[7]\n" + "fmla z16.h, z29.h, z2.h[7]\n" + "fmla z20.h, z29.h, z3.h[7]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z29.h, z4.h[7]\n" + "fmla z9.h, z28.h, z0.h[7]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z13.h, z28.h, z1.h[7]\n" + "fmla z17.h, z28.h, z2.h[7]\n" + "fmla z21.h, z28.h, z3.h[7]\n" + "fmla z25.h, z28.h, z4.h[7]\n" + "ld1h { z28.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z10.h, z29.h, z0.h[7]\n" + "fmla z14.h, z29.h, z1.h[7]\n" + "fmla z18.h, z29.h, z2.h[7]\n" + "fmla z22.h, z29.h, z3.h[7]\n" + "fmla z26.h, z29.h, z4.h[7]\n" + "fmla z11.h, z28.h, z0.h[7]\n" + "fmla z15.h, z28.h, z1.h[7]\n" + "fmla z19.h, z28.h, z2.h[7]\n" + "fmla z23.h, z28.h, z3.h[7]\n" + "fmla z27.h, z28.h, z4.h[7]\n" "68:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2452,49 +2452,49 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "add x22, x23, x20, LSL #1\n" "tbz %x[flags], #1, 69f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z29.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmin z20.h, p5/M, z20.h, z1.h\n" - "fmin z21.h, p5/M, z21.h, z1.h\n" - "fmin z22.h, p5/M, z22.h, z1.h\n" - "fmin z23.h, p5/M, z23.h, z1.h\n" - "fmin z24.h, p5/M, z24.h, z1.h\n" - "fmin z25.h, p5/M, z25.h, z1.h\n" - "fmin z26.h, p5/M, z26.h, z1.h\n" - "fmin z27.h, p5/M, z27.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" - "fmax z20.h, p5/M, z20.h, z0.h\n" - 
"fmax z21.h, p5/M, z21.h, z0.h\n" - "fmax z22.h, p5/M, z22.h, z0.h\n" - "fmax z23.h, p5/M, z23.h, z0.h\n" - "fmax z24.h, p5/M, z24.h, z0.h\n" - "fmax z25.h, p5/M, z25.h, z0.h\n" - "fmax z26.h, p5/M, z26.h, z0.h\n" - "fmax z27.h, p5/M, z27.h, z0.h\n" + "ld1rh { z28.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z29.h\n" + "fmin z9.h, p5/M, z9.h, z29.h\n" + "fmin z10.h, p5/M, z10.h, z29.h\n" + "fmin z11.h, p5/M, z11.h, z29.h\n" + "fmin z12.h, p5/M, z12.h, z29.h\n" + "fmin z13.h, p5/M, z13.h, z29.h\n" + "fmin z14.h, p5/M, z14.h, z29.h\n" + "fmin z15.h, p5/M, z15.h, z29.h\n" + "fmin z16.h, p5/M, z16.h, z29.h\n" + "fmin z17.h, p5/M, z17.h, z29.h\n" + "fmin z18.h, p5/M, z18.h, z29.h\n" + "fmin z19.h, p5/M, z19.h, z29.h\n" + "fmin z20.h, p5/M, z20.h, z29.h\n" + "fmin z21.h, p5/M, z21.h, z29.h\n" + "fmin z22.h, p5/M, z22.h, z29.h\n" + "fmin z23.h, p5/M, z23.h, z29.h\n" + "fmin z24.h, p5/M, z24.h, z29.h\n" + "fmin z25.h, p5/M, z25.h, z29.h\n" + "fmin z26.h, p5/M, z26.h, z29.h\n" + "fmin z27.h, p5/M, z27.h, z29.h\n" + "fmax z8.h, p5/M, z8.h, z28.h\n" + "fmax z9.h, p5/M, z9.h, z28.h\n" + "fmax z10.h, p5/M, z10.h, z28.h\n" + "fmax z11.h, p5/M, z11.h, z28.h\n" + "fmax z12.h, p5/M, z12.h, z28.h\n" + "fmax z13.h, p5/M, z13.h, z28.h\n" + "fmax z14.h, p5/M, z14.h, z28.h\n" + "fmax z15.h, p5/M, z15.h, z28.h\n" + "fmax z16.h, p5/M, z16.h, z28.h\n" + "fmax z17.h, p5/M, z17.h, z28.h\n" + "fmax z18.h, p5/M, z18.h, z28.h\n" + "fmax z19.h, p5/M, z19.h, z28.h\n" + "fmax z20.h, p5/M, z20.h, z28.h\n" + "fmax z21.h, p5/M, z21.h, z28.h\n" + "fmax z22.h, p5/M, z22.h, z28.h\n" + "fmax z23.h, p5/M, z23.h, z28.h\n" + "fmax z24.h, p5/M, z24.h, z28.h\n" + "fmax z25.h, p5/M, z25.h, z28.h\n" + "fmax z26.h, p5/M, z26.h, z28.h\n" + "fmax z27.h, p5/M, z27.h, z28.h\n" "69:" // Height 5: No activation "st1h { z8.h }, p4, [x13]\n" "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" @@ -2590,35 +2590,35 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "74:" // Height 6: no bias "tbz %x[flags], #0, 75f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p4/Z, [x13]\n" + "add x24, x13, x20, LSL #1\n" "add x23, x24, x20, LSL #1\n" + "ld1h { z8.h }, p4/Z, [x13]\n" "add x22, x23, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" - "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p4/Z, [x22]\n" - "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n" - "ld1h { z28.h }, p4/Z, [x21]\n" - "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n" - "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n" - "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { 
z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x22]\n" + "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x21]\n" + "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x20]\n" + "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n" "b 76f\n" "75:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -2650,16 +2650,16 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "77:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 78f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 79f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2671,258 +2671,258 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "b 79f\n" "78:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "79:" // Height 6: input setup done "cmp x27, #0x8\n" "ble 81f\n" "80:" // Height 6: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z7.h }, p0/Z, [x26]\n" + "ld1rqh { z6.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z5.h }, p0/Z, [x24]\n" + "ld1rqh { z4.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1rqh { z5.h }, p0/Z, [x21]\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "ld1rqh { z2.h }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z1.h }, p5/Z, [x12]\n" + "ld1h { z0.h }, p5/Z, [x11]\n" + "fmla z8.h, z1.h, z7.h[0]\n" + "fmla z12.h, z1.h, z6.h[0]\n" + "fmla z16.h, z1.h, z5.h[0]\n" + "fmla z20.h, z1.h, z4.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z1.h, z3.h[0]\n" + "fmla z28.h, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10]\n" "add x21, x21, #0x10\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, 
z4.h[0]\n" - "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z30.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" - "fmla z31.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z30.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "fmla z31.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z30.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "fmla z31.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z30.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x12, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "fmla z31.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x11, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla 
z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9, #4, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z30.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x12, #5, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "fmla z31.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x11, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z30.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x12, #6, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "fmla z31.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x11, #6, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9, #6, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z30.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x12, #7, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[0]\n" + "fmla z13.h, z0.h, z6.h[0]\n" + "fmla z17.h, z0.h, z5.h[0]\n" + "fmla z21.h, z0.h, z4.h[0]\n" + "fmla z25.h, z0.h, z3.h[0]\n" + "fmla z29.h, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x9]\n" + "fmla z10.h, z1.h, z7.h[0]\n" + "fmla z14.h, z1.h, z6.h[0]\n" + "fmla z18.h, z1.h, z5.h[0]\n" + "fmla z22.h, z1.h, z4.h[0]\n" + "fmla z26.h, z1.h, z3.h[0]\n" + "fmla z30.h, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[0]\n" + "fmla z15.h, z0.h, z6.h[0]\n" + "fmla z19.h, z0.h, z5.h[0]\n" + "fmla z23.h, z0.h, z4.h[0]\n" + "fmla z27.h, z0.h, z3.h[0]\n" + "fmla z31.h, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[1]\n" + "fmla z12.h, z1.h, z6.h[1]\n" + "fmla z16.h, z1.h, z5.h[1]\n" + "fmla z20.h, z1.h, z4.h[1]\n" + "fmla z24.h, z1.h, z3.h[1]\n" + "fmla z28.h, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[1]\n" + "fmla z13.h, z0.h, z6.h[1]\n" + "fmla z17.h, z0.h, z5.h[1]\n" + "fmla z21.h, z0.h, z4.h[1]\n" + "fmla z25.h, z0.h, z3.h[1]\n" + "fmla z29.h, z0.h, z2.h[1]\n" 
+ "ld1h { z0.h }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[1]\n" + "fmla z14.h, z1.h, z6.h[1]\n" + "fmla z18.h, z1.h, z5.h[1]\n" + "fmla z22.h, z1.h, z4.h[1]\n" + "fmla z26.h, z1.h, z3.h[1]\n" + "fmla z30.h, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[1]\n" + "fmla z15.h, z0.h, z6.h[1]\n" + "fmla z19.h, z0.h, z5.h[1]\n" + "fmla z23.h, z0.h, z4.h[1]\n" + "fmla z27.h, z0.h, z3.h[1]\n" + "fmla z31.h, z0.h, z2.h[1]\n" + "ld1h { z0.h }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[2]\n" + "fmla z12.h, z1.h, z6.h[2]\n" + "fmla z16.h, z1.h, z5.h[2]\n" + "fmla z20.h, z1.h, z4.h[2]\n" + "fmla z24.h, z1.h, z3.h[2]\n" + "fmla z28.h, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[2]\n" + "fmla z13.h, z0.h, z6.h[2]\n" + "fmla z17.h, z0.h, z5.h[2]\n" + "fmla z21.h, z0.h, z4.h[2]\n" + "fmla z25.h, z0.h, z3.h[2]\n" + "fmla z29.h, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[2]\n" + "fmla z14.h, z1.h, z6.h[2]\n" + "fmla z18.h, z1.h, z5.h[2]\n" + "fmla z22.h, z1.h, z4.h[2]\n" + "fmla z26.h, z1.h, z3.h[2]\n" + "fmla z30.h, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[2]\n" + "fmla z15.h, z0.h, z6.h[2]\n" + "fmla z19.h, z0.h, z5.h[2]\n" + "fmla z23.h, z0.h, z4.h[2]\n" + "fmla z27.h, z0.h, z3.h[2]\n" + "fmla z31.h, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[3]\n" + "fmla z12.h, z1.h, z6.h[3]\n" + "fmla z16.h, z1.h, z5.h[3]\n" + "fmla z20.h, z1.h, z4.h[3]\n" + "fmla z24.h, z1.h, z3.h[3]\n" + "fmla z28.h, z1.h, z2.h[3]\n" + "ld1h { z1.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[3]\n" + "fmla z13.h, z0.h, z6.h[3]\n" + "fmla z17.h, z0.h, z5.h[3]\n" + "fmla z21.h, z0.h, z4.h[3]\n" + "fmla z25.h, z0.h, z3.h[3]\n" + "fmla z29.h, z0.h, z2.h[3]\n" + "ld1h { z0.h }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[3]\n" + "fmla z14.h, z1.h, z6.h[3]\n" + "fmla z18.h, z1.h, z5.h[3]\n" + "fmla z22.h, z1.h, z4.h[3]\n" + "fmla z26.h, z1.h, z3.h[3]\n" + "fmla z30.h, z1.h, z2.h[3]\n" + "ld1h { z1.h }, p5/Z, [x12, #4, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[3]\n" + "fmla z15.h, z0.h, z6.h[3]\n" + "fmla z19.h, z0.h, z5.h[3]\n" + "fmla z23.h, z0.h, z4.h[3]\n" + "fmla z27.h, z0.h, z3.h[3]\n" + "fmla z31.h, z0.h, z2.h[3]\n" + "ld1h { z0.h }, p5/Z, [x11, #4, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[4]\n" + "fmla z12.h, z1.h, z6.h[4]\n" + "fmla z16.h, z1.h, z5.h[4]\n" + "fmla z20.h, z1.h, z4.h[4]\n" + "fmla z24.h, z1.h, z3.h[4]\n" + "fmla z28.h, z1.h, z2.h[4]\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[4]\n" + "fmla z13.h, z0.h, z6.h[4]\n" + "fmla z17.h, z0.h, z5.h[4]\n" + "fmla z21.h, z0.h, z4.h[4]\n" + "fmla z25.h, z0.h, z3.h[4]\n" + "fmla z29.h, z0.h, z2.h[4]\n" + "ld1h { z0.h }, p5/Z, [x9, #4, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[4]\n" + "fmla z14.h, z1.h, z6.h[4]\n" + "fmla z18.h, z1.h, z5.h[4]\n" + "fmla z22.h, z1.h, z4.h[4]\n" + "fmla z26.h, z1.h, z3.h[4]\n" + "fmla z30.h, z1.h, z2.h[4]\n" + "ld1h { z1.h }, p5/Z, [x12, #5, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[4]\n" + "fmla z15.h, z0.h, z6.h[4]\n" + "fmla z19.h, z0.h, z5.h[4]\n" + "fmla z23.h, z0.h, z4.h[4]\n" + "fmla z27.h, z0.h, z3.h[4]\n" + "fmla z31.h, z0.h, z2.h[4]\n" + "ld1h { z0.h }, p5/Z, [x11, #5, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[5]\n" + "fmla z12.h, z1.h, z6.h[5]\n" + "fmla z16.h, z1.h, z5.h[5]\n" + "fmla z20.h, z1.h, z4.h[5]\n" + "fmla z24.h, z1.h, z3.h[5]\n" + "fmla z28.h, z1.h, z2.h[5]\n" + "ld1h { 
z1.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[5]\n" + "fmla z13.h, z0.h, z6.h[5]\n" + "fmla z17.h, z0.h, z5.h[5]\n" + "fmla z21.h, z0.h, z4.h[5]\n" + "fmla z25.h, z0.h, z3.h[5]\n" + "fmla z29.h, z0.h, z2.h[5]\n" + "ld1h { z0.h }, p5/Z, [x9, #5, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[5]\n" + "fmla z14.h, z1.h, z6.h[5]\n" + "fmla z18.h, z1.h, z5.h[5]\n" + "fmla z22.h, z1.h, z4.h[5]\n" + "fmla z26.h, z1.h, z3.h[5]\n" + "fmla z30.h, z1.h, z2.h[5]\n" + "ld1h { z1.h }, p5/Z, [x12, #6, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[5]\n" + "fmla z15.h, z0.h, z6.h[5]\n" + "fmla z19.h, z0.h, z5.h[5]\n" + "fmla z23.h, z0.h, z4.h[5]\n" + "fmla z27.h, z0.h, z3.h[5]\n" + "fmla z31.h, z0.h, z2.h[5]\n" + "ld1h { z0.h }, p5/Z, [x11, #6, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[6]\n" + "fmla z12.h, z1.h, z6.h[6]\n" + "fmla z16.h, z1.h, z5.h[6]\n" + "fmla z20.h, z1.h, z4.h[6]\n" + "fmla z24.h, z1.h, z3.h[6]\n" + "fmla z28.h, z1.h, z2.h[6]\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[6]\n" + "fmla z13.h, z0.h, z6.h[6]\n" + "fmla z17.h, z0.h, z5.h[6]\n" + "fmla z21.h, z0.h, z4.h[6]\n" + "fmla z25.h, z0.h, z3.h[6]\n" + "fmla z29.h, z0.h, z2.h[6]\n" + "ld1h { z0.h }, p5/Z, [x9, #6, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[6]\n" + "fmla z14.h, z1.h, z6.h[6]\n" + "fmla z18.h, z1.h, z5.h[6]\n" + "fmla z22.h, z1.h, z4.h[6]\n" + "fmla z26.h, z1.h, z3.h[6]\n" + "fmla z30.h, z1.h, z2.h[6]\n" + "ld1h { z1.h }, p5/Z, [x12, #7, MUL VL]\n" "addvl x12, x12, #8\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "fmla z31.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x11, #7, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[6]\n" + "fmla z15.h, z0.h, z6.h[6]\n" + "fmla z19.h, z0.h, z5.h[6]\n" + "fmla z23.h, z0.h, z4.h[6]\n" + "fmla z27.h, z0.h, z3.h[6]\n" + "fmla z31.h, z0.h, z2.h[6]\n" + "ld1h { z0.h }, p5/Z, [x11, #7, MUL VL]\n" "addvl x11, x11, #8\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[7]\n" + "fmla z12.h, z1.h, z6.h[7]\n" + "fmla z16.h, z1.h, z5.h[7]\n" + "fmla z20.h, z1.h, z4.h[7]\n" + "fmla z24.h, z1.h, z3.h[7]\n" + "fmla z28.h, z1.h, z2.h[7]\n" + "ld1h { z1.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[7]\n" + "fmla z13.h, z0.h, z6.h[7]\n" + "fmla z17.h, z0.h, z5.h[7]\n" + "fmla z21.h, z0.h, z4.h[7]\n" + "fmla z25.h, z0.h, z3.h[7]\n" + "fmla z29.h, z0.h, z2.h[7]\n" + "ld1h { z0.h }, p5/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z30.h, z6.h, z5.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" - "fmla z31.h, z7.h, z5.h[7]\n" + "fmla z10.h, z1.h, z7.h[7]\n" + "fmla z14.h, z1.h, z6.h[7]\n" + "fmla z18.h, z1.h, z5.h[7]\n" + "fmla z22.h, z1.h, z4.h[7]\n" + "fmla z26.h, z1.h, z3.h[7]\n" + "fmla z30.h, z1.h, z2.h[7]\n" + "fmla z11.h, z0.h, 
z7.h[7]\n" + "fmla z15.h, z0.h, z6.h[7]\n" + "fmla z19.h, z0.h, z5.h[7]\n" + "fmla z23.h, z0.h, z4.h[7]\n" + "fmla z27.h, z0.h, z3.h[7]\n" + "fmla z31.h, z0.h, z2.h[7]\n" "bgt 80b\n" "81:" // Height 6: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -2933,275 +2933,275 @@ void sve_ffhybrid_fp16_mla_6x4VL ( "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" "ld1rqh { z5.h }, p0/Z, [x21]\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[0]\n" + "fmla z12.h, z7.h, z1.h[0]\n" + "fmla z16.h, z7.h, z2.h[0]\n" + "fmla z20.h, z7.h, z3.h[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[0]\n" + "fmla z28.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z9.h, z6.h, z0.h[0]\n" + "fmla z13.h, z6.h, z1.h[0]\n" + "fmla z17.h, z6.h, z2.h[0]\n" + "fmla z21.h, z6.h, z3.h[0]\n" + "fmla z25.h, z6.h, z4.h[0]\n" + "fmla z29.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z30.h, z6.h, z5.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" - "fmla z31.h, z7.h, z5.h[0]\n" + "fmla z10.h, z7.h, z0.h[0]\n" + "fmla z14.h, z7.h, z1.h[0]\n" + "fmla z18.h, z7.h, z2.h[0]\n" + "fmla z22.h, z7.h, z3.h[0]\n" + "fmla z26.h, z7.h, z4.h[0]\n" + "fmla z30.h, z7.h, z5.h[0]\n" + "fmla z11.h, z6.h, z0.h[0]\n" + "fmla z15.h, z6.h, z1.h[0]\n" + "fmla z19.h, z6.h, z2.h[0]\n" + "fmla z23.h, z6.h, z3.h[0]\n" + "fmla z27.h, z6.h, z4.h[0]\n" + "fmla z31.h, z6.h, z5.h[0]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[1]\n" + "fmla z12.h, z7.h, z1.h[1]\n" + "fmla z16.h, z7.h, z2.h[1]\n" + "fmla z20.h, z7.h, z3.h[1]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[1]\n" + "fmla z28.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z9.h, z6.h, z0.h[1]\n" + "fmla z13.h, z6.h, z1.h[1]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[1]\n" + "fmla z21.h, z6.h, z3.h[1]\n" + "fmla z25.h, z6.h, z4.h[1]\n" + "fmla z29.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, 
z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z30.h, z6.h, z5.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "fmla z31.h, z7.h, z5.h[1]\n" + "fmla z10.h, z7.h, z0.h[1]\n" + "fmla z14.h, z7.h, z1.h[1]\n" + "fmla z18.h, z7.h, z2.h[1]\n" + "fmla z22.h, z7.h, z3.h[1]\n" + "fmla z26.h, z7.h, z4.h[1]\n" + "fmla z30.h, z7.h, z5.h[1]\n" + "fmla z11.h, z6.h, z0.h[1]\n" + "fmla z15.h, z6.h, z1.h[1]\n" + "fmla z19.h, z6.h, z2.h[1]\n" + "fmla z23.h, z6.h, z3.h[1]\n" + "fmla z27.h, z6.h, z4.h[1]\n" + "fmla z31.h, z6.h, z5.h[1]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[2]\n" + "fmla z12.h, z7.h, z1.h[2]\n" + "fmla z16.h, z7.h, z2.h[2]\n" + "fmla z20.h, z7.h, z3.h[2]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[2]\n" + "fmla z28.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z9.h, z6.h, z0.h[2]\n" + "fmla z13.h, z6.h, z1.h[2]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[2]\n" + "fmla z21.h, z6.h, z3.h[2]\n" + "fmla z25.h, z6.h, z4.h[2]\n" + "fmla z29.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z30.h, z6.h, z5.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "fmla z31.h, z7.h, z5.h[2]\n" + "fmla z10.h, z7.h, z0.h[2]\n" + "fmla z14.h, z7.h, z1.h[2]\n" + "fmla z18.h, z7.h, z2.h[2]\n" + "fmla z22.h, z7.h, z3.h[2]\n" + "fmla z26.h, z7.h, z4.h[2]\n" + "fmla z30.h, z7.h, z5.h[2]\n" + "fmla z11.h, z6.h, z0.h[2]\n" + "fmla z15.h, z6.h, z1.h[2]\n" + "fmla z19.h, z6.h, z2.h[2]\n" + "fmla z23.h, z6.h, z3.h[2]\n" + "fmla z27.h, z6.h, z4.h[2]\n" + "fmla z31.h, z6.h, z5.h[2]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[3]\n" + "fmla z12.h, z7.h, z1.h[3]\n" + "fmla z16.h, z7.h, z2.h[3]\n" + "fmla z20.h, z7.h, z3.h[3]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[3]\n" + "fmla z28.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z9.h, z6.h, z0.h[3]\n" + "fmla z13.h, z6.h, z1.h[3]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { 
z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[3]\n" + "fmla z21.h, z6.h, z3.h[3]\n" + "fmla z25.h, z6.h, z4.h[3]\n" + "fmla z29.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z30.h, z6.h, z5.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "fmla z31.h, z7.h, z5.h[3]\n" + "fmla z10.h, z7.h, z0.h[3]\n" + "fmla z14.h, z7.h, z1.h[3]\n" + "fmla z18.h, z7.h, z2.h[3]\n" + "fmla z22.h, z7.h, z3.h[3]\n" + "fmla z26.h, z7.h, z4.h[3]\n" + "fmla z30.h, z7.h, z5.h[3]\n" + "fmla z11.h, z6.h, z0.h[3]\n" + "fmla z15.h, z6.h, z1.h[3]\n" + "fmla z19.h, z6.h, z2.h[3]\n" + "fmla z23.h, z6.h, z3.h[3]\n" + "fmla z27.h, z6.h, z4.h[3]\n" + "fmla z31.h, z6.h, z5.h[3]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[4]\n" + "fmla z12.h, z7.h, z1.h[4]\n" + "fmla z16.h, z7.h, z2.h[4]\n" + "fmla z20.h, z7.h, z3.h[4]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[4]\n" + "fmla z28.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z9.h, z6.h, z0.h[4]\n" + "fmla z13.h, z6.h, z1.h[4]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[4]\n" + "fmla z21.h, z6.h, z3.h[4]\n" + "fmla z25.h, z6.h, z4.h[4]\n" + "fmla z29.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z30.h, z6.h, z5.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "fmla z31.h, z7.h, z5.h[4]\n" + "fmla z10.h, z7.h, z0.h[4]\n" + "fmla z14.h, z7.h, z1.h[4]\n" + "fmla z18.h, z7.h, z2.h[4]\n" + "fmla z22.h, z7.h, z3.h[4]\n" + "fmla z26.h, z7.h, z4.h[4]\n" + "fmla z30.h, z7.h, z5.h[4]\n" + "fmla z11.h, z6.h, z0.h[4]\n" + "fmla z15.h, z6.h, z1.h[4]\n" + "fmla z19.h, z6.h, z2.h[4]\n" + "fmla z23.h, z6.h, z3.h[4]\n" + "fmla z27.h, z6.h, z4.h[4]\n" + "fmla z31.h, z6.h, z5.h[4]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[5]\n" + "fmla z12.h, z7.h, z1.h[5]\n" + "fmla z16.h, z7.h, z2.h[5]\n" + "fmla z20.h, z7.h, z3.h[5]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[5]\n" + "fmla z28.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - 
"fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z9.h, z6.h, z0.h[5]\n" + "fmla z13.h, z6.h, z1.h[5]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[5]\n" + "fmla z21.h, z6.h, z3.h[5]\n" + "fmla z25.h, z6.h, z4.h[5]\n" + "fmla z29.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z30.h, z6.h, z5.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "fmla z31.h, z7.h, z5.h[5]\n" + "fmla z10.h, z7.h, z0.h[5]\n" + "fmla z14.h, z7.h, z1.h[5]\n" + "fmla z18.h, z7.h, z2.h[5]\n" + "fmla z22.h, z7.h, z3.h[5]\n" + "fmla z26.h, z7.h, z4.h[5]\n" + "fmla z30.h, z7.h, z5.h[5]\n" + "fmla z11.h, z6.h, z0.h[5]\n" + "fmla z15.h, z6.h, z1.h[5]\n" + "fmla z19.h, z6.h, z2.h[5]\n" + "fmla z23.h, z6.h, z3.h[5]\n" + "fmla z27.h, z6.h, z4.h[5]\n" + "fmla z31.h, z6.h, z5.h[5]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[6]\n" + "fmla z12.h, z7.h, z1.h[6]\n" + "fmla z16.h, z7.h, z2.h[6]\n" + "fmla z20.h, z7.h, z3.h[6]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[6]\n" + "fmla z28.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z9.h, z6.h, z0.h[6]\n" + "fmla z13.h, z6.h, z1.h[6]\n" "addvl x10, x10, #1\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z17.h, z6.h, z2.h[6]\n" + "fmla z21.h, z6.h, z3.h[6]\n" + "fmla z25.h, z6.h, z4.h[6]\n" + "fmla z29.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z30.h, z6.h, z5.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "fmla z31.h, z7.h, z5.h[6]\n" + "fmla z10.h, z7.h, z0.h[6]\n" + "fmla z14.h, z7.h, z1.h[6]\n" + "fmla z18.h, z7.h, z2.h[6]\n" + "fmla z22.h, z7.h, z3.h[6]\n" + "fmla z26.h, z7.h, z4.h[6]\n" + "fmla z30.h, z7.h, z5.h[6]\n" + "fmla z11.h, z6.h, z0.h[6]\n" + "fmla z15.h, z6.h, z1.h[6]\n" + "fmla z19.h, z6.h, z2.h[6]\n" + "fmla z23.h, z6.h, z3.h[6]\n" + "fmla z27.h, z6.h, z4.h[6]\n" + "fmla z31.h, z6.h, z5.h[6]\n" "ble 82f\n" - "ld1h { z6.h }, p5/Z, [x12]\n" - "ld1h { z7.h }, p5/Z, [x11]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z7.h }, p5/Z, [x12]\n" + "ld1h { z6.h }, p5/Z, [x11]\n" + "fmla z8.h, z7.h, z0.h[7]\n" + "fmla z12.h, z7.h, z1.h[7]\n" + "fmla z16.h, z7.h, z2.h[7]\n" + "fmla 
z20.h, z7.h, z3.h[7]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" + "fmla z24.h, z7.h, z4.h[7]\n" + "fmla z28.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x9]\n" + "fmla z9.h, z6.h, z0.h[7]\n" + "fmla z13.h, z6.h, z1.h[7]\n" + "fmla z17.h, z6.h, z2.h[7]\n" + "fmla z21.h, z6.h, z3.h[7]\n" + "fmla z25.h, z6.h, z4.h[7]\n" + "fmla z29.h, z6.h, z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z30.h, z6.h, z5.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" - "fmla z31.h, z7.h, z5.h[7]\n" + "fmla z10.h, z7.h, z0.h[7]\n" + "fmla z14.h, z7.h, z1.h[7]\n" + "fmla z18.h, z7.h, z2.h[7]\n" + "fmla z22.h, z7.h, z3.h[7]\n" + "fmla z26.h, z7.h, z4.h[7]\n" + "fmla z30.h, z7.h, z5.h[7]\n" + "fmla z11.h, z6.h, z0.h[7]\n" + "fmla z15.h, z6.h, z1.h[7]\n" + "fmla z19.h, z6.h, z2.h[7]\n" + "fmla z23.h, z6.h, z3.h[7]\n" + "fmla z27.h, z6.h, z4.h[7]\n" + "fmla z31.h, z6.h, z5.h[7]\n" "82:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3315,4 +3315,4 @@ void sve_ffhybrid_fp16_mla_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp index b4c124c1e3..3a93a2f7c8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp index 32fcac3a45..8e4fd4388e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/a64fx.cpp @@ -163,11 +163,11 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -183,12 +183,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "10:" // Height 1: Multiply loop: Main loop "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p4/Z, [x10]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" "add x26, x26, #0x4\n" "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -201,12 +201,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p4/Z, [x10]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" @@ -214,17 +214,17 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "bne 7b\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z17.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z17.s\n" + "fmin z9.s, p4/M, z9.s, z17.s\n" + "fmin z10.s, p4/M, z10.s, z17.s\n" + "fmin z11.s, p4/M, z11.s, z17.s\n" + "fmax z8.s, p4/M, z8.s, z16.s\n" + "fmax z9.s, p4/M, z9.s, z16.s\n" + "fmax z10.s, p4/M, z10.s, z16.s\n" + "fmax z11.s, p4/M, z11.s, z16.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p3, [x13]\n" "st1w { z9.s }, p2, [x13, #1, MUL VL]\n" @@ -285,15 +285,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "17:" // Height 2: no bias "tbz %x[flags], #0, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" + "add x20, x13, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x13]\n" "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, 
p0/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x20]\n" + "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n" "b 19f\n" "18:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -309,12 +309,12 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "20:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 21f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 22f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -322,7 +322,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "b 22f\n" "21:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "22:" // Height 2: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -333,19 +333,19 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "23:" // Height 2: Multiply loop: Main loop "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z12.s, p4/M, z6.s, z1.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z17.s }, p4/Z, [x10]\n" "addvl x12, x12, #1\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" "addvl x11, x11, #1\n" "add x26, x26, #0x4\n" "subs x27, x27, #0x1\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z14.s, p4/M, z17.s, z1.s\n" "add x25, x25, #0x4\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" + "fmla z15.s, p4/M, z16.s, z1.s\n" "addvl x10, x10, #1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" @@ -357,18 +357,18 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z12.s, p4/M, z6.s, z1.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z17.s }, p4/Z, [x10]\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z14.s, p4/M, z17.s, z1.s\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" + "fmla z15.s, p4/M, z16.s, z1.s\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "bne 20b\n" @@ -376,25 +376,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x25, x13, x20, LSL #2\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z17.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, 
z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" + "ld1rw { z16.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z17.s\n" + "fmin z9.s, p4/M, z9.s, z17.s\n" + "fmin z10.s, p4/M, z10.s, z17.s\n" + "fmin z11.s, p4/M, z11.s, z17.s\n" + "fmin z12.s, p4/M, z12.s, z17.s\n" + "fmin z13.s, p4/M, z13.s, z17.s\n" + "fmin z14.s, p4/M, z14.s, z17.s\n" + "fmin z15.s, p4/M, z15.s, z17.s\n" + "fmax z8.s, p4/M, z8.s, z16.s\n" + "fmax z9.s, p4/M, z9.s, z16.s\n" + "fmax z10.s, p4/M, z10.s, z16.s\n" + "fmax z11.s, p4/M, z11.s, z16.s\n" + "fmax z12.s, p4/M, z12.s, z16.s\n" + "fmax z13.s, p4/M, z13.s, z16.s\n" + "fmax z14.s, p4/M, z14.s, z16.s\n" + "fmax z15.s, p4/M, z15.s, z16.s\n" "25:" // Height 2: No activation "st1w { z8.s }, p3, [x13]\n" "st1w { z9.s }, p2, [x13, #1, MUL VL]\n" @@ -463,20 +463,20 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "30:" // Height 3: no bias "tbz %x[flags], #0, 31f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x21, x13, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x13]\n" "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x21]\n" + "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x20]\n" + "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n" "b 32f\n" "31:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -496,13 +496,13 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "33:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 34f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 35f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -511,8 +511,8 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "b 35f\n" "34:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "35:" // Height 3: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -528,22 +528,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z16.s, p4/M, z6.s, 
z2.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z21.s }, p4/Z, [x10]\n" "add x26, x26, #0x4\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z20.s }, p4/Z, [x9]\n" "subs x27, x27, #0x1\n" "add x25, x25, #0x4\n" "add x24, x24, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z21.s, z0.s\n" + "fmla z14.s, p4/M, z21.s, z1.s\n" + "fmla z18.s, p4/M, z21.s, z2.s\n" + "fmla z11.s, p4/M, z20.s, z0.s\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z15.s, p4/M, z20.s, z1.s\n" + "fmla z19.s, p4/M, z20.s, z2.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" @@ -557,54 +557,54 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z21.s }, p4/Z, [x10]\n" "cmp x28, x20\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z20.s }, p4/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z21.s, z0.s\n" + "fmla z14.s, p4/M, z21.s, z1.s\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z18.s, p4/M, z21.s, z2.s\n" + "fmla z11.s, p4/M, z20.s, z0.s\n" "addvl x9, x9, #1\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z15.s, p4/M, z20.s, z1.s\n" + "fmla z19.s, p4/M, z20.s, z2.s\n" "bne 33b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #2\n" "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z21.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" + "ld1rw { z20.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z21.s\n" + "fmin z9.s, p4/M, z9.s, z21.s\n" + "fmin z10.s, p4/M, z10.s, z21.s\n" + "fmin z11.s, p4/M, z11.s, z21.s\n" + "fmin z12.s, p4/M, z12.s, z21.s\n" + "fmin z13.s, p4/M, z13.s, z21.s\n" + "fmin z14.s, p4/M, z14.s, z21.s\n" + "fmin z15.s, p4/M, z15.s, z21.s\n" + "fmin z16.s, p4/M, z16.s, z21.s\n" + "fmin z17.s, p4/M, z17.s, z21.s\n" + "fmin z18.s, p4/M, z18.s, z21.s\n" + "fmin z19.s, p4/M, z19.s, z21.s\n" + "fmax z8.s, p4/M, z8.s, z20.s\n" + "fmax z9.s, p4/M, z9.s, z20.s\n" + "fmax z10.s, p4/M, z10.s, 
z20.s\n" + "fmax z11.s, p4/M, z11.s, z20.s\n" + "fmax z12.s, p4/M, z12.s, z20.s\n" + "fmax z13.s, p4/M, z13.s, z20.s\n" + "fmax z14.s, p4/M, z14.s, z20.s\n" + "fmax z15.s, p4/M, z15.s, z20.s\n" + "fmax z16.s, p4/M, z16.s, z20.s\n" + "fmax z17.s, p4/M, z17.s, z20.s\n" + "fmax z18.s, p4/M, z18.s, z20.s\n" + "fmax z19.s, p4/M, z19.s, z20.s\n" "38:" // Height 3: No activation "st1w { z8.s }, p3, [x13]\n" "st1w { z9.s }, p2, [x13, #1, MUL VL]\n" @@ -681,25 +681,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "43:" // Height 4: no bias "tbz %x[flags], #0, 44f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x22, x13, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x22]\n" + "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x21]\n" + "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x20]\n" + "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n" "b 45f\n" "44:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -723,14 +723,14 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -740,9 +740,9 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "b 48f\n" "47:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "48:" // Height 4: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -759,7 +759,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z20.s, p4/M, z6.s, z3.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z25.s }, p4/Z, [x10]\n" "add x26, x26, #0x4\n" "fmla z9.s, p4/M, z7.s, 
z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" @@ -767,22 +767,22 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x25, x25, #0x4\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "fmla z21.s, p4/M, z7.s, z3.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z24.s }, p4/Z, [x9]\n" "add x24, x24, #0x4\n" "add x23, x23, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z25.s, z0.s\n" + "fmla z14.s, p4/M, z25.s, z1.s\n" "addvl x10, x10, #1\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z18.s, p4/M, z25.s, z2.s\n" + "fmla z22.s, p4/M, z25.s, z3.s\n" "addvl x9, x9, #1\n" "ld1w { z6.s }, p4/Z, [x12]\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z11.s, p4/M, z24.s, z0.s\n" + "fmla z15.s, p4/M, z24.s, z1.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z19.s, p4/M, z24.s, z2.s\n" + "fmla z23.s, p4/M, z24.s, z3.s\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1w { z7.s }, p4/Z, [x11]\n" @@ -794,7 +794,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z20.s, p4/M, z6.s, z3.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z25.s }, p4/Z, [x10]\n" "cmp x28, x20\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" @@ -802,17 +802,17 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "addvl x11, x11, #1\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "fmla z21.s, p4/M, z7.s, z3.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z24.s }, p4/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z25.s, z0.s\n" + "fmla z14.s, p4/M, z25.s, z1.s\n" "addvl x9, x9, #1\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z18.s, p4/M, z25.s, z2.s\n" + "fmla z22.s, p4/M, z25.s, z3.s\n" + "fmla z11.s, p4/M, z24.s, z0.s\n" + "fmla z15.s, p4/M, z24.s, z1.s\n" + "fmla z19.s, p4/M, z24.s, z2.s\n" + "fmla z23.s, p4/M, z24.s, z3.s\n" "bne 46b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #2\n" @@ -820,41 +820,41 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z25.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmin z20.s, p4/M, z20.s, z1.s\n" - "fmin z21.s, p4/M, z21.s, z1.s\n" - "fmin z22.s, p4/M, z22.s, z1.s\n" - "fmin z23.s, p4/M, z23.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, 
z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" - "fmax z20.s, p4/M, z20.s, z0.s\n" - "fmax z21.s, p4/M, z21.s, z0.s\n" - "fmax z22.s, p4/M, z22.s, z0.s\n" - "fmax z23.s, p4/M, z23.s, z0.s\n" + "ld1rw { z24.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z25.s\n" + "fmin z9.s, p4/M, z9.s, z25.s\n" + "fmin z10.s, p4/M, z10.s, z25.s\n" + "fmin z11.s, p4/M, z11.s, z25.s\n" + "fmin z12.s, p4/M, z12.s, z25.s\n" + "fmin z13.s, p4/M, z13.s, z25.s\n" + "fmin z14.s, p4/M, z14.s, z25.s\n" + "fmin z15.s, p4/M, z15.s, z25.s\n" + "fmin z16.s, p4/M, z16.s, z25.s\n" + "fmin z17.s, p4/M, z17.s, z25.s\n" + "fmin z18.s, p4/M, z18.s, z25.s\n" + "fmin z19.s, p4/M, z19.s, z25.s\n" + "fmin z20.s, p4/M, z20.s, z25.s\n" + "fmin z21.s, p4/M, z21.s, z25.s\n" + "fmin z22.s, p4/M, z22.s, z25.s\n" + "fmin z23.s, p4/M, z23.s, z25.s\n" + "fmax z8.s, p4/M, z8.s, z24.s\n" + "fmax z9.s, p4/M, z9.s, z24.s\n" + "fmax z10.s, p4/M, z10.s, z24.s\n" + "fmax z11.s, p4/M, z11.s, z24.s\n" + "fmax z12.s, p4/M, z12.s, z24.s\n" + "fmax z13.s, p4/M, z13.s, z24.s\n" + "fmax z14.s, p4/M, z14.s, z24.s\n" + "fmax z15.s, p4/M, z15.s, z24.s\n" + "fmax z16.s, p4/M, z16.s, z24.s\n" + "fmax z17.s, p4/M, z17.s, z24.s\n" + "fmax z18.s, p4/M, z18.s, z24.s\n" + "fmax z19.s, p4/M, z19.s, z24.s\n" + "fmax z20.s, p4/M, z20.s, z24.s\n" + "fmax z21.s, p4/M, z21.s, z24.s\n" + "fmax z22.s, p4/M, z22.s, z24.s\n" + "fmax z23.s, p4/M, z23.s, z24.s\n" "51:" // Height 4: No activation "st1w { z8.s }, p3, [x13]\n" "st1w { z9.s }, p2, [x13, #1, MUL VL]\n" @@ -939,30 +939,30 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "56:" // Height 5: no bias "tbz %x[flags], #0, 57f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x13, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x13]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" "b 58f\n" "57:" // Height 5: no accumulate 
"mov z8.b, #0x0\n" @@ -990,15 +990,15 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "59:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 60f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 61f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1009,10 +1009,10 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "b 61f\n" "60:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "61:" // Height 5: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -1034,7 +1034,7 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z24.s, p4/M, z6.s, z4.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z29.s }, p4/Z, [x10]\n" "add x25, x25, #0x4\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" @@ -1042,24 +1042,24 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x23, x23, #0x4\n" "fmla z21.s, p4/M, z7.s, z3.s\n" "fmla z25.s, p4/M, z7.s, z4.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z28.s }, p4/Z, [x9]\n" "add x22, x22, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z29.s, z0.s\n" + "fmla z14.s, p4/M, z29.s, z1.s\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z26.s, p4/M, z6.s, z4.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z18.s, p4/M, z29.s, z2.s\n" + "fmla z22.s, p4/M, z29.s, z3.s\n" + "fmla z26.s, p4/M, z29.s, z4.s\n" + "fmla z11.s, p4/M, z28.s, z0.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1w { z6.s }, p4/Z, [x12]\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z15.s, p4/M, z28.s, z1.s\n" + "fmla z19.s, p4/M, z28.s, z2.s\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" - "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z23.s, p4/M, z28.s, z3.s\n" + "fmla z27.s, p4/M, z28.s, z4.s\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1rw { z4.s }, p4/Z, [x22]\n" "ld1w { z7.s }, p4/Z, [x11]\n" @@ -1075,25 +1075,25 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "addvl x12, x12, #1\n" "fmla z24.s, p4/M, z6.s, z4.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10]\n" + "ld1w { z29.s }, p4/Z, [x10]\n" "addvl x11, x11, #1\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "addvl x10, x10, #1\n" "fmla z21.s, p4/M, z7.s, z3.s\n" "fmla z25.s, p4/M, z7.s, z4.s\n" - "ld1w { z7.s }, p4/Z, [x9]\n" + "ld1w { z28.s }, p4/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z26.s, p4/M, z6.s, z4.s\n" 
- "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" - "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z10.s, p4/M, z29.s, z0.s\n" + "fmla z14.s, p4/M, z29.s, z1.s\n" + "fmla z18.s, p4/M, z29.s, z2.s\n" + "fmla z22.s, p4/M, z29.s, z3.s\n" + "fmla z26.s, p4/M, z29.s, z4.s\n" + "fmla z11.s, p4/M, z28.s, z0.s\n" + "fmla z15.s, p4/M, z28.s, z1.s\n" + "fmla z19.s, p4/M, z28.s, z2.s\n" + "fmla z23.s, p4/M, z28.s, z3.s\n" + "fmla z27.s, p4/M, z28.s, z4.s\n" "bne 59b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x13, x20, LSL #2\n" @@ -1102,49 +1102,49 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z29.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmin z20.s, p4/M, z20.s, z1.s\n" - "fmin z21.s, p4/M, z21.s, z1.s\n" - "fmin z22.s, p4/M, z22.s, z1.s\n" - "fmin z23.s, p4/M, z23.s, z1.s\n" - "fmin z24.s, p4/M, z24.s, z1.s\n" - "fmin z25.s, p4/M, z25.s, z1.s\n" - "fmin z26.s, p4/M, z26.s, z1.s\n" - "fmin z27.s, p4/M, z27.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" - "fmax z20.s, p4/M, z20.s, z0.s\n" - "fmax z21.s, p4/M, z21.s, z0.s\n" - "fmax z22.s, p4/M, z22.s, z0.s\n" - "fmax z23.s, p4/M, z23.s, z0.s\n" - "fmax z24.s, p4/M, z24.s, z0.s\n" - "fmax z25.s, p4/M, z25.s, z0.s\n" - "fmax z26.s, p4/M, z26.s, z0.s\n" - "fmax z27.s, p4/M, z27.s, z0.s\n" + "ld1rw { z28.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z29.s\n" + "fmin z9.s, p4/M, z9.s, z29.s\n" + "fmin z10.s, p4/M, z10.s, z29.s\n" + "fmin z11.s, p4/M, z11.s, z29.s\n" + "fmin z12.s, p4/M, z12.s, z29.s\n" + "fmin z13.s, p4/M, z13.s, z29.s\n" + "fmin z14.s, p4/M, z14.s, z29.s\n" + "fmin z15.s, p4/M, z15.s, z29.s\n" + "fmin z16.s, p4/M, z16.s, z29.s\n" + "fmin z17.s, p4/M, z17.s, z29.s\n" + "fmin z18.s, p4/M, z18.s, z29.s\n" + "fmin z19.s, p4/M, z19.s, z29.s\n" + "fmin z20.s, p4/M, z20.s, z29.s\n" + "fmin z21.s, p4/M, z21.s, z29.s\n" + "fmin z22.s, p4/M, z22.s, z29.s\n" + "fmin z23.s, p4/M, z23.s, z29.s\n" + "fmin z24.s, p4/M, z24.s, z29.s\n" + "fmin z25.s, p4/M, z25.s, z29.s\n" + "fmin z26.s, p4/M, z26.s, z29.s\n" + "fmin z27.s, p4/M, z27.s, z29.s\n" + "fmax z8.s, p4/M, z8.s, z28.s\n" + "fmax z9.s, p4/M, z9.s, z28.s\n" + "fmax z10.s, p4/M, z10.s, z28.s\n" + "fmax z11.s, p4/M, z11.s, z28.s\n" + "fmax z12.s, p4/M, z12.s, z28.s\n" + "fmax z13.s, p4/M, z13.s, z28.s\n" + "fmax z14.s, p4/M, z14.s, z28.s\n" + "fmax z15.s, p4/M, z15.s, z28.s\n" + "fmax z16.s, p4/M, z16.s, z28.s\n" + "fmax z17.s, p4/M, z17.s, z28.s\n" + "fmax z18.s, p4/M, z18.s, z28.s\n" + "fmax 
z19.s, p4/M, z19.s, z28.s\n" + "fmax z20.s, p4/M, z20.s, z28.s\n" + "fmax z21.s, p4/M, z21.s, z28.s\n" + "fmax z22.s, p4/M, z22.s, z28.s\n" + "fmax z23.s, p4/M, z23.s, z28.s\n" + "fmax z24.s, p4/M, z24.s, z28.s\n" + "fmax z25.s, p4/M, z25.s, z28.s\n" + "fmax z26.s, p4/M, z26.s, z28.s\n" + "fmax z27.s, p4/M, z27.s, z28.s\n" "64:" // Height 5: No activation "st1w { z8.s }, p3, [x13]\n" "st1w { z9.s }, p2, [x13, #1, MUL VL]\n" @@ -1240,35 +1240,35 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "69:" // Height 6: no bias "tbz %x[flags], #0, 70f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x13]\n" + "add x24, x13, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x13]\n" "add x22, x23, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x13, #2, MUL VL]\n" - "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z11.s }, p0/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p3/Z, [x21]\n" - "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x22]\n" + "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x21]\n" + "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p3/Z, [x20]\n" + "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n" "b 71f\n" "70:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1300,16 +1300,16 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "72:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 73f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, 
#0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 74f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1321,11 +1321,11 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( "b 74f\n" "73:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "74:" // Height 6: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -1527,4 +1527,4 @@ void sve_ffhybrid_fp32_mla_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp index eb057e7734..b1ab31e618 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32_mla_6x4VL/generic.cpp @@ -163,11 +163,11 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -180,40 +180,40 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "10:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x12]\n" + "fmla z8.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10]\n" + "fmla z10.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z11.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z8.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z9.s, 
z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z10.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z11.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z8.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z10.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z11.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z8.s, z16.s, z0.s[3]\n" + "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n" "sub x27, x27, #0x4\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n" "cmp x27, #0x4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" "add x26, x26, #0x10\n" "addvl x12, x12, #4\n" "addvl x11, x11, #4\n" @@ -223,56 +223,56 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "11:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x12]\n" + "fmla z8.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z10.s, z17.s, z0.s[0]\n" + "fmla z11.s, z16.s, z0.s[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[1]\n" + "fmla z9.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z10.s, z17.s, z0.s[1]\n" + "fmla z11.s, z16.s, z0.s[1]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[2]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z10.s, z17.s, z0.s[2]\n" + "fmla z11.s, z16.s, z0.s[2]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 12f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[3]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, 
[x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" @@ -284,17 +284,17 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "bne 7b\n" "tbz %x[flags], #1, 13f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "13:" // Height 1: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -355,15 +355,15 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "18:" // Height 2: no bias "tbz %x[flags], #0, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" + "add x20, x13, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x13]\n" "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" "b 20f\n" "19:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -379,12 +379,12 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "21:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 22f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -392,143 +392,143 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "b 23f\n" "22:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "23:" // Height 2: input setup done "cmp x27, #0x4\n" "ble 25f\n" "24:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + 
"ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z1.s[0]\n" + "fmla z12.s, z17.s, z0.s[0]\n" + "fmla z9.s, z16.s, z1.s[0]\n" + "fmla z13.s, z16.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z10.s, z17.s, z1.s[0]\n" + "fmla z14.s, z17.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x12, #1, MUL VL]\n" "cmp x27, #0x4\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[0]\n" + "fmla z15.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x11, #1, MUL VL]\n" "add x26, x26, #0x10\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[1]\n" + "fmla z12.s, z17.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #1, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[1]\n" + "fmla z13.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.s, z17.s, z1.s[1]\n" + "fmla z14.s, z17.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[1]\n" + "fmla z15.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[2]\n" + "fmla z12.s, z17.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[2]\n" + "fmla z13.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.s, z17.s, z1.s[2]\n" + "fmla z14.s, z17.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x12, #3, MUL VL]\n" "addvl x12, x12, #4\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[2]\n" + "fmla z15.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[3]\n" + "fmla z12.s, z17.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[3]\n" + "fmla z13.s, z16.s, z0.s[3]\n" + "ld1w { z16.s }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z10.s, z17.s, z1.s[3]\n" + "fmla z14.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z1.s[3]\n" + "fmla z15.s, z16.s, z0.s[3]\n" "bgt 24b\n" "25:" // Height 2: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" "ld1rqw { z1.s }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w 
{ z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[0]\n" + "fmla z12.s, z17.s, z1.s[0]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "fmla z13.s, z16.s, z1.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z10.s, z17.s, z0.s[0]\n" + "fmla z14.s, z17.s, z1.s[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z11.s, z16.s, z0.s[0]\n" + "fmla z15.s, z16.s, z1.s[0]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[1]\n" + "fmla z12.s, z17.s, z1.s[1]\n" + "fmla z9.s, z16.s, z0.s[1]\n" + "fmla z13.s, z16.s, z1.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z10.s, z17.s, z0.s[1]\n" + "fmla z14.s, z17.s, z1.s[1]\n" "addvl x12, x12, #1\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z11.s, z16.s, z0.s[1]\n" + "fmla z15.s, z16.s, z1.s[1]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[2]\n" + "fmla z12.s, z17.s, z1.s[2]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "fmla z13.s, z16.s, z1.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z10.s, z17.s, z0.s[2]\n" + "fmla z14.s, z17.s, z1.s[2]\n" "addvl x12, x12, #1\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z11.s, z16.s, z0.s[2]\n" + "fmla z15.s, z16.s, z1.s[2]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "ble 26f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" + "ld1w { z17.s }, p5/Z, [x12]\n" + "ld1w { z16.s }, p5/Z, [x11]\n" + "fmla z8.s, z17.s, z0.s[3]\n" + "fmla z12.s, z17.s, z1.s[3]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "fmla z13.s, z16.s, z1.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x9]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z14.s, z17.s, z1.s[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" + "fmla z15.s, z16.s, z1.s[3]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" "26:" // Height 2: Multiply loop: multiply skip @@ -540,25 +540,25 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "add x25, x13, x20, LSL #2\n" "tbz %x[flags], #1, 27f\n" "add x20, 
%x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, p5/M, z14.s, z17.s\n" + "fmin z15.s, p5/M, z15.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z15.s, p5/M, z15.s, z16.s\n" "27:" // Height 2: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -627,20 +627,20 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "32:" // Height 3: no bias "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x21, x13, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x13]\n" "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20]\n" + "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n" "b 34f\n" "33:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -660,13 +660,13 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "35:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -675,89 +675,89 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "b 37f\n" "36:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add 
x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "37:" // Height 3: input setup done "cmp x27, #0x4\n" "ble 39f\n" "38:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" "ld1rqw { z1.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1rqw { z0.s }, p0/Z, [x24]\n" + "ld1w { z21.s }, p5/Z, [x12]\n" + "fmla z8.s, z21.s, z2.s[0]\n" + "fmla z12.s, z21.s, z1.s[0]\n" + "ld1w { z20.s }, p5/Z, [x11]\n" + "fmla z16.s, z21.s, z0.s[0]\n" + "fmla z9.s, z20.s, z2.s[0]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "fmla z13.s, z20.s, z1.s[0]\n" + "fmla z17.s, z20.s, z0.s[0]\n" + "ld1w { z20.s }, p5/Z, [x9]\n" "cmp x27, #0x4\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z10.s, z21.s, z2.s[0]\n" + "fmla z14.s, z21.s, z1.s[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z18.s, z21.s, z0.s[0]\n" + "fmla z11.s, z20.s, z2.s[0]\n" + "ld1w { z21.s }, p5/Z, [x12, #1, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[0]\n" + "fmla z19.s, z20.s, z0.s[0]\n" + "ld1w { z20.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[1]\n" + "fmla z12.s, z21.s, z1.s[1]\n" + "fmla z16.s, z21.s, z0.s[1]\n" + "fmla z9.s, z20.s, z2.s[1]\n" + "ld1w { z21.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[1]\n" + "fmla z17.s, z20.s, z0.s[1]\n" + "ld1w { z20.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.s, z21.s, z2.s[1]\n" + "fmla z14.s, z21.s, z1.s[1]\n" + "fmla z18.s, z21.s, z0.s[1]\n" + "fmla z11.s, z20.s, z2.s[1]\n" + "ld1w { z21.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[1]\n" + "fmla z19.s, z20.s, z0.s[1]\n" + "ld1w { z20.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[2]\n" + "fmla z12.s, z21.s, z1.s[2]\n" + "fmla z16.s, z21.s, z0.s[2]\n" + "fmla z9.s, z20.s, z2.s[2]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, 
z20.s, z1.s[2]\n" + "fmla z17.s, z20.s, z0.s[2]\n" + "ld1w { z20.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.s, z21.s, z2.s[2]\n" + "fmla z14.s, z21.s, z1.s[2]\n" + "fmla z18.s, z21.s, z0.s[2]\n" + "fmla z11.s, z20.s, z2.s[2]\n" + "ld1w { z21.s }, p5/Z, [x12, #3, MUL VL]\n" "addvl x12, x12, #4\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[2]\n" + "fmla z19.s, z20.s, z0.s[2]\n" + "ld1w { z20.s }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[3]\n" + "fmla z12.s, z21.s, z1.s[3]\n" + "fmla z16.s, z21.s, z0.s[3]\n" + "fmla z9.s, z20.s, z2.s[3]\n" + "ld1w { z21.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[3]\n" + "fmla z17.s, z20.s, z0.s[3]\n" + "ld1w { z20.s }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z10.s, z21.s, z2.s[3]\n" + "fmla z14.s, z21.s, z1.s[3]\n" + "fmla z18.s, z21.s, z0.s[3]\n" + "fmla z11.s, z20.s, z2.s[3]\n" + "fmla z15.s, z20.s, z1.s[3]\n" + "fmla z19.s, z20.s, z0.s[3]\n" "bgt 38b\n" "39:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -765,91 +765,91 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "ld1rqw { z1.s }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "ld1w { z21.s }, p5/Z, [x12]\n" + "fmla z8.s, z21.s, z0.s[0]\n" + "fmla z12.s, z21.s, z1.s[0]\n" + "ld1w { z20.s }, p5/Z, [x11]\n" + "fmla z16.s, z21.s, z2.s[0]\n" + "fmla z9.s, z20.s, z0.s[0]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "fmla z13.s, z20.s, z1.s[0]\n" + "fmla z17.s, z20.s, z2.s[0]\n" + "ld1w { z20.s }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z10.s, z21.s, z0.s[0]\n" + "fmla z14.s, z21.s, z1.s[0]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z18.s, z21.s, z2.s[0]\n" + "fmla z11.s, z20.s, z0.s[0]\n" "addvl x9, x9, #1\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z15.s, z20.s, z1.s[0]\n" + "fmla z19.s, z20.s, z2.s[0]\n" "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z21.s }, p5/Z, [x12]\n" + "ld1w { z20.s }, p5/Z, [x11]\n" + "fmla z8.s, z21.s, z0.s[1]\n" + "fmla z12.s, z21.s, z1.s[1]\n" + "fmla z16.s, z21.s, z2.s[1]\n" + "fmla z9.s, z20.s, z0.s[1]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z13.s, z20.s, z1.s[1]\n" + "fmla z17.s, 
z20.s, z2.s[1]\n" + "ld1w { z20.s }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z10.s, z21.s, z0.s[1]\n" + "fmla z14.s, z21.s, z1.s[1]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z18.s, z21.s, z2.s[1]\n" + "fmla z11.s, z20.s, z0.s[1]\n" "addvl x9, x9, #1\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z15.s, z20.s, z1.s[1]\n" + "fmla z19.s, z20.s, z2.s[1]\n" "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z21.s }, p5/Z, [x12]\n" + "ld1w { z20.s }, p5/Z, [x11]\n" + "fmla z8.s, z21.s, z0.s[2]\n" + "fmla z12.s, z21.s, z1.s[2]\n" + "fmla z16.s, z21.s, z2.s[2]\n" + "fmla z9.s, z20.s, z0.s[2]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z13.s, z20.s, z1.s[2]\n" + "fmla z17.s, z20.s, z2.s[2]\n" + "ld1w { z20.s }, p5/Z, [x9]\n" "addvl x12, x12, #1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z10.s, z21.s, z0.s[2]\n" + "fmla z14.s, z21.s, z1.s[2]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z18.s, z21.s, z2.s[2]\n" + "fmla z11.s, z20.s, z0.s[2]\n" "addvl x9, x9, #1\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z15.s, z20.s, z1.s[2]\n" + "fmla z19.s, z20.s, z2.s[2]\n" "ble 40f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z21.s }, p5/Z, [x12]\n" + "ld1w { z20.s }, p5/Z, [x11]\n" + "fmla z8.s, z21.s, z0.s[3]\n" + "fmla z12.s, z21.s, z1.s[3]\n" + "fmla z16.s, z21.s, z2.s[3]\n" + "fmla z9.s, z20.s, z0.s[3]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z13.s, z20.s, z1.s[3]\n" + "fmla z17.s, z20.s, z2.s[3]\n" + "ld1w { z20.s }, p5/Z, [x9]\n" "addvl x11, x11, #1\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z10.s, z21.s, z0.s[3]\n" + "fmla z14.s, z21.s, z1.s[3]\n" "addvl x10, x10, #1\n" "addvl x9, x9, #1\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z18.s, z21.s, z2.s[3]\n" + "fmla z11.s, z20.s, z0.s[3]\n" + "fmla z15.s, z20.s, z1.s[3]\n" + "fmla z19.s, z20.s, z2.s[3]\n" "40:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -860,33 +860,33 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 41f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z21.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - 
"fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z20.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z21.s\n" + "fmin z9.s, p5/M, z9.s, z21.s\n" + "fmin z10.s, p5/M, z10.s, z21.s\n" + "fmin z11.s, p5/M, z11.s, z21.s\n" + "fmin z12.s, p5/M, z12.s, z21.s\n" + "fmin z13.s, p5/M, z13.s, z21.s\n" + "fmin z14.s, p5/M, z14.s, z21.s\n" + "fmin z15.s, p5/M, z15.s, z21.s\n" + "fmin z16.s, p5/M, z16.s, z21.s\n" + "fmin z17.s, p5/M, z17.s, z21.s\n" + "fmin z18.s, p5/M, z18.s, z21.s\n" + "fmin z19.s, p5/M, z19.s, z21.s\n" + "fmax z8.s, p5/M, z8.s, z20.s\n" + "fmax z9.s, p5/M, z9.s, z20.s\n" + "fmax z10.s, p5/M, z10.s, z20.s\n" + "fmax z11.s, p5/M, z11.s, z20.s\n" + "fmax z12.s, p5/M, z12.s, z20.s\n" + "fmax z13.s, p5/M, z13.s, z20.s\n" + "fmax z14.s, p5/M, z14.s, z20.s\n" + "fmax z15.s, p5/M, z15.s, z20.s\n" + "fmax z16.s, p5/M, z16.s, z20.s\n" + "fmax z17.s, p5/M, z17.s, z20.s\n" + "fmax z18.s, p5/M, z18.s, z20.s\n" + "fmax z19.s, p5/M, z19.s, z20.s\n" "41:" // Height 3: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -963,25 +963,25 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "46:" // Height 4: no bias "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x22, x13, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21]\n" + "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "b 48f\n" "47:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -1005,14 +1005,14 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "49:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, 
LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1022,108 +1022,108 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "b 51f\n" "50:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "51:" // Height 4: input setup done "cmp x27, #0x4\n" "ble 53f\n" "52:" // Height 4: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z3.s }, p0/Z, [x26]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z25.s }, p5/Z, [x12]\n" + "ld1w { z24.s }, p5/Z, [x11]\n" + "fmla z8.s, z25.s, z3.s[0]\n" + "fmla z12.s, z25.s, z2.s[0]\n" + "fmla z16.s, z25.s, z1.s[0]\n" + "fmla z20.s, z25.s, z0.s[0]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" "add x25, x25, #0x10\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z9.s, z24.s, z3.s[0]\n" + "fmla z13.s, z24.s, z2.s[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, 
[x12, #3, MUL VL]\n" + "fmla z17.s, z24.s, z1.s[0]\n" + "fmla z21.s, z24.s, z0.s[0]\n" + "ld1w { z24.s }, p5/Z, [x9]\n" + "fmla z10.s, z25.s, z3.s[0]\n" + "fmla z14.s, z25.s, z2.s[0]\n" + "fmla z18.s, z25.s, z1.s[0]\n" + "fmla z22.s, z25.s, z0.s[0]\n" + "ld1w { z25.s }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[0]\n" + "fmla z15.s, z24.s, z2.s[0]\n" + "fmla z19.s, z24.s, z1.s[0]\n" + "fmla z23.s, z24.s, z0.s[0]\n" + "ld1w { z24.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[1]\n" + "fmla z12.s, z25.s, z2.s[1]\n" + "fmla z16.s, z25.s, z1.s[1]\n" + "fmla z20.s, z25.s, z0.s[1]\n" + "ld1w { z25.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[1]\n" + "fmla z13.s, z24.s, z2.s[1]\n" + "fmla z17.s, z24.s, z1.s[1]\n" + "fmla z21.s, z24.s, z0.s[1]\n" + "ld1w { z24.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.s, z25.s, z3.s[1]\n" + "fmla z14.s, z25.s, z2.s[1]\n" + "fmla z18.s, z25.s, z1.s[1]\n" + "fmla z22.s, z25.s, z0.s[1]\n" + "ld1w { z25.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[1]\n" + "fmla z15.s, z24.s, z2.s[1]\n" + "fmla z19.s, z24.s, z1.s[1]\n" + "fmla z23.s, z24.s, z0.s[1]\n" + "ld1w { z24.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[2]\n" + "fmla z12.s, z25.s, z2.s[2]\n" + "fmla z16.s, z25.s, z1.s[2]\n" + "fmla z20.s, z25.s, z0.s[2]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[2]\n" + "fmla z13.s, z24.s, z2.s[2]\n" + "fmla z17.s, z24.s, z1.s[2]\n" + "fmla z21.s, z24.s, z0.s[2]\n" + "ld1w { z24.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.s, z25.s, z3.s[2]\n" + "fmla z14.s, z25.s, z2.s[2]\n" + "fmla z18.s, z25.s, z1.s[2]\n" + "fmla z22.s, z25.s, z0.s[2]\n" + "ld1w { z25.s }, p5/Z, [x12, #3, MUL VL]\n" "addvl x12, x12, #4\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[2]\n" + "fmla z15.s, z24.s, z2.s[2]\n" + "fmla z19.s, z24.s, z1.s[2]\n" + "fmla z23.s, z24.s, z0.s[2]\n" + "ld1w { z24.s }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[3]\n" + "fmla z12.s, z25.s, z2.s[3]\n" + "fmla z16.s, z25.s, z1.s[3]\n" + "fmla z20.s, z25.s, z0.s[3]\n" + "ld1w { z25.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[3]\n" + "fmla z13.s, z24.s, z2.s[3]\n" + "fmla z17.s, z24.s, z1.s[3]\n" + "fmla z21.s, z24.s, z0.s[3]\n" + "ld1w { z24.s }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z10.s, z25.s, z3.s[3]\n" + "fmla z14.s, z25.s, z2.s[3]\n" + "fmla z18.s, z25.s, z1.s[3]\n" + "fmla z22.s, z25.s, z0.s[3]\n" + "fmla z11.s, z24.s, z3.s[3]\n" + "fmla z15.s, z24.s, z2.s[3]\n" + "fmla z19.s, z24.s, z1.s[3]\n" + "fmla z23.s, z24.s, z0.s[3]\n" "bgt 52b\n" "53:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -1132,107 +1132,107 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "subs x27, x27, #0x1\n" 
"ld1rqw { z2.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z25.s }, p5/Z, [x12]\n" + "ld1w { z24.s }, p5/Z, [x11]\n" + "fmla z8.s, z25.s, z0.s[0]\n" + "fmla z12.s, z25.s, z1.s[0]\n" + "fmla z16.s, z25.s, z2.s[0]\n" + "fmla z20.s, z25.s, z3.s[0]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z9.s, z24.s, z0.s[0]\n" + "fmla z13.s, z24.s, z1.s[0]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z24.s, z2.s[0]\n" + "fmla z21.s, z24.s, z3.s[0]\n" + "ld1w { z24.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z10.s, z25.s, z0.s[0]\n" + "fmla z14.s, z25.s, z1.s[0]\n" + "fmla z18.s, z25.s, z2.s[0]\n" + "fmla z22.s, z25.s, z3.s[0]\n" + "fmla z11.s, z24.s, z0.s[0]\n" + "fmla z15.s, z24.s, z1.s[0]\n" + "fmla z19.s, z24.s, z2.s[0]\n" + "fmla z23.s, z24.s, z3.s[0]\n" "ble 54f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z25.s }, p5/Z, [x12]\n" + "ld1w { z24.s }, p5/Z, [x11]\n" + "fmla z8.s, z25.s, z0.s[1]\n" + "fmla z12.s, z25.s, z1.s[1]\n" + "fmla z16.s, z25.s, z2.s[1]\n" + "fmla z20.s, z25.s, z3.s[1]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z9.s, z24.s, z0.s[1]\n" + "fmla z13.s, z24.s, z1.s[1]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z24.s, z2.s[1]\n" + "fmla z21.s, z24.s, z3.s[1]\n" + "ld1w { z24.s }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z10.s, z25.s, z0.s[1]\n" + "fmla z14.s, z25.s, z1.s[1]\n" "addvl x9, x9, #1\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z18.s, z25.s, z2.s[1]\n" + "fmla z22.s, z25.s, z3.s[1]\n" + "fmla z11.s, z24.s, z0.s[1]\n" + "fmla z15.s, z24.s, z1.s[1]\n" + "fmla z19.s, z24.s, z2.s[1]\n" + "fmla z23.s, z24.s, z3.s[1]\n" "ble 54f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z25.s }, p5/Z, [x12]\n" + "ld1w { z24.s }, p5/Z, [x11]\n" + "fmla z8.s, z25.s, z0.s[2]\n" + "fmla z12.s, z25.s, z1.s[2]\n" + "fmla z16.s, z25.s, z2.s[2]\n" + "fmla z20.s, z25.s, z3.s[2]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" "subs x27, x27, #0x1\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z9.s, z24.s, z0.s[2]\n" + "fmla z13.s, z24.s, z1.s[2]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" 
- "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z24.s, z2.s[2]\n" + "fmla z21.s, z24.s, z3.s[2]\n" + "ld1w { z24.s }, p5/Z, [x9]\n" "addvl x10, x10, #1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z10.s, z25.s, z0.s[2]\n" + "fmla z14.s, z25.s, z1.s[2]\n" "addvl x9, x9, #1\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z18.s, z25.s, z2.s[2]\n" + "fmla z22.s, z25.s, z3.s[2]\n" + "fmla z11.s, z24.s, z0.s[2]\n" + "fmla z15.s, z24.s, z1.s[2]\n" + "fmla z19.s, z24.s, z2.s[2]\n" + "fmla z23.s, z24.s, z3.s[2]\n" "ble 54f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "ld1w { z25.s }, p5/Z, [x12]\n" + "ld1w { z24.s }, p5/Z, [x11]\n" + "fmla z8.s, z25.s, z0.s[3]\n" + "fmla z12.s, z25.s, z1.s[3]\n" + "fmla z16.s, z25.s, z2.s[3]\n" + "fmla z20.s, z25.s, z3.s[3]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" "addvl x12, x12, #1\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z9.s, z24.s, z0.s[3]\n" + "fmla z13.s, z24.s, z1.s[3]\n" "addvl x11, x11, #1\n" "addvl x10, x10, #1\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z24.s, z2.s[3]\n" + "fmla z21.s, z24.s, z3.s[3]\n" + "ld1w { z24.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z10.s, z25.s, z0.s[3]\n" + "fmla z14.s, z25.s, z1.s[3]\n" + "fmla z18.s, z25.s, z2.s[3]\n" + "fmla z22.s, z25.s, z3.s[3]\n" + "fmla z11.s, z24.s, z0.s[3]\n" + "fmla z15.s, z24.s, z1.s[3]\n" + "fmla z19.s, z24.s, z2.s[3]\n" + "fmla z23.s, z24.s, z3.s[3]\n" "54:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1244,41 +1244,41 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 55f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - 
"fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin z14.s, p5/M, z14.s, z25.s\n" + "fmin z15.s, p5/M, z15.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmin z20.s, p5/M, z20.s, z25.s\n" + "fmin z21.s, p5/M, z21.s, z25.s\n" + "fmin z22.s, p5/M, z22.s, z25.s\n" + "fmin z23.s, p5/M, z23.s, z25.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, z11.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z15.s, p5/M, z15.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" + "fmax z20.s, p5/M, z20.s, z24.s\n" + "fmax z21.s, p5/M, z21.s, z24.s\n" + "fmax z22.s, p5/M, z22.s, z24.s\n" + "fmax z23.s, p5/M, z23.s, z24.s\n" "55:" // Height 4: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -1363,30 +1363,30 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "60:" // Height 5: no bias "tbz %x[flags], #0, 61f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x13]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x13, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" "b 62f\n" "61:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1414,15 +1414,15 @@ void 
sve_ffhybrid_fp32_mla_6x4VL ( "63:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 64f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 65f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1433,127 +1433,127 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "b 65f\n" "64:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "65:" // Height 5: input setup done "cmp x27, #0x4\n" "ble 67f\n" "66:" // Height 5: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "ld1rqw { z3.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1rqw { z0.s }, p0/Z, [x22]\n" + "ld1w { z29.s }, p5/Z, [x12]\n" + "fmla z8.s, z29.s, z4.s[0]\n" + "fmla z12.s, z29.s, z3.s[0]\n" + "ld1w { z28.s }, p5/Z, [x11]\n" + "fmla z16.s, z29.s, z2.s[0]\n" + "fmla z20.s, z29.s, z1.s[0]\n" "add x25, x25, #0x10\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z29.s, z0.s[0]\n" + "fmla z9.s, z28.s, z4.s[0]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" "add x24, x24, #0x10\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z13.s, z28.s, z3.s[0]\n" + "fmla z17.s, z28.s, z2.s[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, 
z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z21.s, z28.s, z1.s[0]\n" + "fmla z25.s, z28.s, z0.s[0]\n" + "ld1w { z28.s }, p5/Z, [x9]\n" + "fmla z10.s, z29.s, z4.s[0]\n" + "fmla z14.s, z29.s, z3.s[0]\n" + "fmla z18.s, z29.s, z2.s[0]\n" + "fmla z22.s, z29.s, z1.s[0]\n" + "fmla z26.s, z29.s, z0.s[0]\n" + "fmla z11.s, z28.s, z4.s[0]\n" + "ld1w { z29.s }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[0]\n" + "fmla z19.s, z28.s, z2.s[0]\n" + "fmla z23.s, z28.s, z1.s[0]\n" + "fmla z27.s, z28.s, z0.s[0]\n" + "ld1w { z28.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[1]\n" + "fmla z12.s, z29.s, z3.s[1]\n" + "fmla z16.s, z29.s, z2.s[1]\n" + "fmla z20.s, z29.s, z1.s[1]\n" + "fmla z24.s, z29.s, z0.s[1]\n" + "fmla z9.s, z28.s, z4.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[1]\n" + "fmla z17.s, z28.s, z2.s[1]\n" + "fmla z21.s, z28.s, z1.s[1]\n" + "fmla z25.s, z28.s, z0.s[1]\n" + "ld1w { z28.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.s, z29.s, z4.s[1]\n" + "fmla z14.s, z29.s, z3.s[1]\n" + "fmla z18.s, z29.s, z2.s[1]\n" + "fmla z22.s, z29.s, z1.s[1]\n" + "fmla z26.s, z29.s, z0.s[1]\n" + "fmla z11.s, z28.s, z4.s[1]\n" + "ld1w { z29.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[1]\n" + "fmla z19.s, z28.s, z2.s[1]\n" + "fmla z23.s, z28.s, z1.s[1]\n" + "fmla z27.s, z28.s, z0.s[1]\n" + "ld1w { z28.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[2]\n" + "fmla z12.s, z29.s, z3.s[2]\n" + "fmla z16.s, z29.s, z2.s[2]\n" + "fmla z20.s, z29.s, z1.s[2]\n" + "fmla z24.s, z29.s, z0.s[2]\n" + "fmla z9.s, z28.s, z4.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[2]\n" + "fmla z17.s, z28.s, z2.s[2]\n" + "fmla z21.s, z28.s, z1.s[2]\n" + "fmla z25.s, z28.s, z0.s[2]\n" + "ld1w { z28.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.s, z29.s, z4.s[2]\n" + "fmla z14.s, z29.s, z3.s[2]\n" + "fmla z18.s, z29.s, z2.s[2]\n" + "fmla z22.s, z29.s, z1.s[2]\n" + "fmla z26.s, z29.s, z0.s[2]\n" + "fmla z11.s, z28.s, z4.s[2]\n" + "ld1w { z29.s }, p5/Z, [x12, #3, MUL VL]\n" "addvl x12, x12, #4\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[2]\n" + "fmla z19.s, z28.s, z2.s[2]\n" + "fmla z23.s, z28.s, z1.s[2]\n" + "fmla z27.s, z28.s, z0.s[2]\n" + "ld1w { z28.s }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - 
"fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[3]\n" + "fmla z12.s, z29.s, z3.s[3]\n" + "fmla z16.s, z29.s, z2.s[3]\n" + "fmla z20.s, z29.s, z1.s[3]\n" + "fmla z24.s, z29.s, z0.s[3]\n" + "fmla z9.s, z28.s, z4.s[3]\n" + "ld1w { z29.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[3]\n" + "fmla z17.s, z28.s, z2.s[3]\n" + "fmla z21.s, z28.s, z1.s[3]\n" + "fmla z25.s, z28.s, z0.s[3]\n" + "ld1w { z28.s }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z10.s, z29.s, z4.s[3]\n" + "fmla z14.s, z29.s, z3.s[3]\n" + "fmla z18.s, z29.s, z2.s[3]\n" + "fmla z22.s, z29.s, z1.s[3]\n" + "fmla z26.s, z29.s, z0.s[3]\n" + "fmla z11.s, z28.s, z4.s[3]\n" + "fmla z15.s, z28.s, z3.s[3]\n" + "fmla z19.s, z28.s, z2.s[3]\n" + "fmla z23.s, z28.s, z1.s[3]\n" + "fmla z27.s, z28.s, z0.s[3]\n" "bgt 66b\n" "67:" // Height 5: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -1563,123 +1563,123 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "ld1rqw { z2.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z29.s }, p5/Z, [x12]\n" + "fmla z8.s, z29.s, z0.s[0]\n" + "fmla z12.s, z29.s, z1.s[0]\n" + "ld1w { z28.s }, p5/Z, [x11]\n" + "fmla z16.s, z29.s, z2.s[0]\n" + "fmla z20.s, z29.s, z3.s[0]\n" "addvl x12, x12, #1\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z29.s, z4.s[0]\n" + "fmla z9.s, z28.s, z0.s[0]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z13.s, z28.s, z1.s[0]\n" + "fmla z17.s, z28.s, z2.s[0]\n" "addvl x10, x10, #1\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z21.s, z28.s, z3.s[0]\n" + "fmla z25.s, z28.s, z4.s[0]\n" + "ld1w { z28.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z10.s, z29.s, z0.s[0]\n" + "fmla z14.s, z29.s, z1.s[0]\n" + "fmla z18.s, z29.s, z2.s[0]\n" + "fmla z22.s, z29.s, z3.s[0]\n" + "fmla z26.s, z29.s, z4.s[0]\n" + "fmla z11.s, z28.s, z0.s[0]\n" + "fmla z15.s, z28.s, z1.s[0]\n" + "fmla z19.s, z28.s, z2.s[0]\n" + "fmla z23.s, z28.s, z3.s[0]\n" + "fmla z27.s, z28.s, z4.s[0]\n" "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z29.s }, p5/Z, [x12]\n" + "ld1w { z28.s }, p5/Z, [x11]\n" + "fmla z8.s, 
z29.s, z0.s[1]\n" + "fmla z12.s, z29.s, z1.s[1]\n" + "fmla z16.s, z29.s, z2.s[1]\n" + "fmla z20.s, z29.s, z3.s[1]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z29.s, z4.s[1]\n" + "fmla z9.s, z28.s, z0.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z13.s, z28.s, z1.s[1]\n" + "fmla z17.s, z28.s, z2.s[1]\n" "addvl x10, x10, #1\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z21.s, z28.s, z3.s[1]\n" + "fmla z25.s, z28.s, z4.s[1]\n" + "ld1w { z28.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z10.s, z29.s, z0.s[1]\n" + "fmla z14.s, z29.s, z1.s[1]\n" + "fmla z18.s, z29.s, z2.s[1]\n" + "fmla z22.s, z29.s, z3.s[1]\n" + "fmla z26.s, z29.s, z4.s[1]\n" + "fmla z11.s, z28.s, z0.s[1]\n" + "fmla z15.s, z28.s, z1.s[1]\n" + "fmla z19.s, z28.s, z2.s[1]\n" + "fmla z23.s, z28.s, z3.s[1]\n" + "fmla z27.s, z28.s, z4.s[1]\n" "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z29.s }, p5/Z, [x12]\n" + "ld1w { z28.s }, p5/Z, [x11]\n" + "fmla z8.s, z29.s, z0.s[2]\n" + "fmla z12.s, z29.s, z1.s[2]\n" + "fmla z16.s, z29.s, z2.s[2]\n" + "fmla z20.s, z29.s, z3.s[2]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z29.s, z4.s[2]\n" + "fmla z9.s, z28.s, z0.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z13.s, z28.s, z1.s[2]\n" + "fmla z17.s, z28.s, z2.s[2]\n" "addvl x10, x10, #1\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z21.s, z28.s, z3.s[2]\n" + "fmla z25.s, z28.s, z4.s[2]\n" + "ld1w { z28.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z10.s, z29.s, z0.s[2]\n" + "fmla z14.s, z29.s, z1.s[2]\n" + "fmla z18.s, z29.s, z2.s[2]\n" + "fmla z22.s, z29.s, z3.s[2]\n" + "fmla z26.s, z29.s, z4.s[2]\n" + "fmla z11.s, z28.s, z0.s[2]\n" + "fmla z15.s, z28.s, z1.s[2]\n" + "fmla z19.s, z28.s, z2.s[2]\n" + "fmla z23.s, z28.s, z3.s[2]\n" + "fmla z27.s, z28.s, z4.s[2]\n" "ble 68f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z29.s }, p5/Z, [x12]\n" + "ld1w { z28.s }, p5/Z, [x11]\n" + "fmla z8.s, z29.s, z0.s[3]\n" + "fmla z12.s, z29.s, z1.s[3]\n" + "fmla z16.s, z29.s, z2.s[3]\n" + "fmla z20.s, z29.s, z3.s[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.s, 
z6.s, z4.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z29.s, z4.s[3]\n" + "fmla z9.s, z28.s, z0.s[3]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z13.s, z28.s, z1.s[3]\n" + "fmla z17.s, z28.s, z2.s[3]\n" + "fmla z21.s, z28.s, z3.s[3]\n" + "fmla z25.s, z28.s, z4.s[3]\n" + "ld1w { z28.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z10.s, z29.s, z0.s[3]\n" + "fmla z14.s, z29.s, z1.s[3]\n" + "fmla z18.s, z29.s, z2.s[3]\n" + "fmla z22.s, z29.s, z3.s[3]\n" + "fmla z26.s, z29.s, z4.s[3]\n" + "fmla z11.s, z28.s, z0.s[3]\n" + "fmla z15.s, z28.s, z1.s[3]\n" + "fmla z19.s, z28.s, z2.s[3]\n" + "fmla z23.s, z28.s, z3.s[3]\n" + "fmla z27.s, z28.s, z4.s[3]\n" "68:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1692,49 +1692,49 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 69f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z29.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - "fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "ld1rw { z28.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z29.s\n" + "fmin z9.s, p5/M, z9.s, z29.s\n" + "fmin z10.s, p5/M, z10.s, z29.s\n" + "fmin z11.s, p5/M, z11.s, z29.s\n" + "fmin z12.s, p5/M, z12.s, z29.s\n" + "fmin z13.s, p5/M, z13.s, z29.s\n" + "fmin z14.s, p5/M, z14.s, z29.s\n" + "fmin z15.s, p5/M, z15.s, z29.s\n" + "fmin z16.s, p5/M, z16.s, z29.s\n" + "fmin z17.s, p5/M, z17.s, z29.s\n" + "fmin z18.s, p5/M, z18.s, z29.s\n" + "fmin z19.s, p5/M, z19.s, z29.s\n" + "fmin z20.s, p5/M, 
z20.s, z29.s\n" + "fmin z21.s, p5/M, z21.s, z29.s\n" + "fmin z22.s, p5/M, z22.s, z29.s\n" + "fmin z23.s, p5/M, z23.s, z29.s\n" + "fmin z24.s, p5/M, z24.s, z29.s\n" + "fmin z25.s, p5/M, z25.s, z29.s\n" + "fmin z26.s, p5/M, z26.s, z29.s\n" + "fmin z27.s, p5/M, z27.s, z29.s\n" + "fmax z8.s, p5/M, z8.s, z28.s\n" + "fmax z9.s, p5/M, z9.s, z28.s\n" + "fmax z10.s, p5/M, z10.s, z28.s\n" + "fmax z11.s, p5/M, z11.s, z28.s\n" + "fmax z12.s, p5/M, z12.s, z28.s\n" + "fmax z13.s, p5/M, z13.s, z28.s\n" + "fmax z14.s, p5/M, z14.s, z28.s\n" + "fmax z15.s, p5/M, z15.s, z28.s\n" + "fmax z16.s, p5/M, z16.s, z28.s\n" + "fmax z17.s, p5/M, z17.s, z28.s\n" + "fmax z18.s, p5/M, z18.s, z28.s\n" + "fmax z19.s, p5/M, z19.s, z28.s\n" + "fmax z20.s, p5/M, z20.s, z28.s\n" + "fmax z21.s, p5/M, z21.s, z28.s\n" + "fmax z22.s, p5/M, z22.s, z28.s\n" + "fmax z23.s, p5/M, z23.s, z28.s\n" + "fmax z24.s, p5/M, z24.s, z28.s\n" + "fmax z25.s, p5/M, z25.s, z28.s\n" + "fmax z26.s, p5/M, z26.s, z28.s\n" + "fmax z27.s, p5/M, z27.s, z28.s\n" "69:" // Height 5: No activation "st1w { z8.s }, p4, [x13]\n" "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" @@ -1830,35 +1830,35 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "74:" // Height 6: no bias "tbz %x[flags], #0, 75f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x13, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x13]\n" + "add x24, x13, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x13]\n" "add x22, x23, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" - "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "b 76f\n" "75:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1890,16 +1890,16 @@ void 
sve_ffhybrid_fp32_mla_6x4VL ( "77:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 78f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 79f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1911,146 +1911,146 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "b 79f\n" "78:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "79:" // Height 6: input setup done "cmp x27, #0x4\n" "ble 81f\n" "80:" // Height 6: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z7.s }, p0/Z, [x26]\n" + "ld1rqw { z6.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1rqw { z5.s }, p0/Z, [x21]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z1.s }, p5/Z, [x12]\n" + "ld1w { z0.s }, p5/Z, [x11]\n" + "fmla z8.s, z1.s, z7.s[0]\n" + "fmla z12.s, z1.s, z6.s[0]\n" + "fmla z16.s, z1.s, z5.s[0]\n" + "fmla z20.s, z1.s, z4.s[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z1.s, z3.s[0]\n" + "fmla z28.s, z1.s, z2.s[0]\n" + "ld1w { z1.s }, p5/Z, [x10]\n" "add x21, x21, #0x10\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z30.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x12, #1, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "fmla z31.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x11, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z28.s, z6.s, 
z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9, #1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z30.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x12, #2, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "fmla z31.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x11, #2, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z28.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z30.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x12, #3, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[0]\n" + "fmla z13.s, z0.s, z6.s[0]\n" + "fmla z17.s, z0.s, z5.s[0]\n" + "fmla z21.s, z0.s, z4.s[0]\n" + "fmla z25.s, z0.s, z3.s[0]\n" + "fmla z29.s, z0.s, z2.s[0]\n" + "ld1w { z0.s }, p5/Z, [x9]\n" + "fmla z10.s, z1.s, z7.s[0]\n" + "fmla z14.s, z1.s, z6.s[0]\n" + "fmla z18.s, z1.s, z5.s[0]\n" + "fmla z22.s, z1.s, z4.s[0]\n" + "fmla z26.s, z1.s, z3.s[0]\n" + "fmla z30.s, z1.s, z2.s[0]\n" + "ld1w { z1.s }, p5/Z, [x12, #1, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[0]\n" + "fmla z15.s, z0.s, z6.s[0]\n" + "fmla z19.s, z0.s, z5.s[0]\n" + "fmla z23.s, z0.s, z4.s[0]\n" + "fmla z27.s, z0.s, z3.s[0]\n" + "fmla z31.s, z0.s, z2.s[0]\n" + "ld1w { z0.s }, p5/Z, [x11, #1, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[1]\n" + "fmla z12.s, z1.s, z6.s[1]\n" + "fmla z16.s, z1.s, z5.s[1]\n" + "fmla z20.s, z1.s, z4.s[1]\n" + "fmla z24.s, z1.s, z3.s[1]\n" + "fmla z28.s, z1.s, z2.s[1]\n" + "ld1w { z1.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[1]\n" + "fmla z13.s, z0.s, z6.s[1]\n" + "fmla z17.s, z0.s, z5.s[1]\n" + "fmla z21.s, z0.s, z4.s[1]\n" + "fmla z25.s, z0.s, z3.s[1]\n" + "fmla z29.s, z0.s, z2.s[1]\n" + "ld1w { z0.s }, p5/Z, [x9, #1, MUL VL]\n" + "fmla z10.s, z1.s, z7.s[1]\n" + "fmla z14.s, z1.s, z6.s[1]\n" + "fmla z18.s, z1.s, z5.s[1]\n" + "fmla z22.s, z1.s, z4.s[1]\n" + "fmla z26.s, z1.s, z3.s[1]\n" + "fmla z30.s, z1.s, z2.s[1]\n" + "ld1w { z1.s }, p5/Z, [x12, #2, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[1]\n" + "fmla z15.s, z0.s, z6.s[1]\n" + "fmla z19.s, z0.s, z5.s[1]\n" + "fmla z23.s, z0.s, z4.s[1]\n" + "fmla z27.s, z0.s, z3.s[1]\n" + "fmla z31.s, z0.s, z2.s[1]\n" + "ld1w { z0.s }, p5/Z, [x11, #2, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[2]\n" + "fmla z12.s, z1.s, z6.s[2]\n" + "fmla z16.s, z1.s, z5.s[2]\n" + "fmla z20.s, z1.s, z4.s[2]\n" + "fmla z24.s, z1.s, z3.s[2]\n" + "fmla z28.s, z1.s, z2.s[2]\n" + "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[2]\n" + "fmla z13.s, z0.s, z6.s[2]\n" + "fmla z17.s, z0.s, z5.s[2]\n" + "fmla z21.s, z0.s, z4.s[2]\n" + "fmla z25.s, z0.s, z3.s[2]\n" + "fmla z29.s, z0.s, z2.s[2]\n" + "ld1w { 
z0.s }, p5/Z, [x9, #2, MUL VL]\n" + "fmla z10.s, z1.s, z7.s[2]\n" + "fmla z14.s, z1.s, z6.s[2]\n" + "fmla z18.s, z1.s, z5.s[2]\n" + "fmla z22.s, z1.s, z4.s[2]\n" + "fmla z26.s, z1.s, z3.s[2]\n" + "fmla z30.s, z1.s, z2.s[2]\n" + "ld1w { z1.s }, p5/Z, [x12, #3, MUL VL]\n" "addvl x12, x12, #4\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "fmla z31.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x11, #3, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[2]\n" + "fmla z15.s, z0.s, z6.s[2]\n" + "fmla z19.s, z0.s, z5.s[2]\n" + "fmla z23.s, z0.s, z4.s[2]\n" + "fmla z27.s, z0.s, z3.s[2]\n" + "fmla z31.s, z0.s, z2.s[2]\n" + "ld1w { z0.s }, p5/Z, [x11, #3, MUL VL]\n" "addvl x11, x11, #4\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[3]\n" + "fmla z12.s, z1.s, z6.s[3]\n" + "fmla z16.s, z1.s, z5.s[3]\n" + "fmla z20.s, z1.s, z4.s[3]\n" + "fmla z24.s, z1.s, z3.s[3]\n" + "fmla z28.s, z1.s, z2.s[3]\n" + "ld1w { z1.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9, #3, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[3]\n" + "fmla z13.s, z0.s, z6.s[3]\n" + "fmla z17.s, z0.s, z5.s[3]\n" + "fmla z21.s, z0.s, z4.s[3]\n" + "fmla z25.s, z0.s, z3.s[3]\n" + "fmla z29.s, z0.s, z2.s[3]\n" + "ld1w { z0.s }, p5/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z30.s, z6.s, z5.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" - "fmla z31.s, z7.s, z5.s[3]\n" + "fmla z10.s, z1.s, z7.s[3]\n" + "fmla z14.s, z1.s, z6.s[3]\n" + "fmla z18.s, z1.s, z5.s[3]\n" + "fmla z22.s, z1.s, z4.s[3]\n" + "fmla z26.s, z1.s, z3.s[3]\n" + "fmla z30.s, z1.s, z2.s[3]\n" + "fmla z11.s, z0.s, z7.s[3]\n" + "fmla z15.s, z0.s, z6.s[3]\n" + "fmla z19.s, z0.s, z5.s[3]\n" + "fmla z23.s, z0.s, z4.s[3]\n" + "fmla z27.s, z0.s, z3.s[3]\n" + "fmla z31.s, z0.s, z2.s[3]\n" "bgt 80b\n" "81:" // Height 6: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -2061,139 +2061,139 @@ void sve_ffhybrid_fp32_mla_6x4VL ( "ld1rqw { z3.s }, p0/Z, [x23]\n" "ld1rqw { z4.s }, p0/Z, [x22]\n" "ld1rqw { z5.s }, p0/Z, [x21]\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x12]\n" + "ld1w { z6.s }, p5/Z, [x11]\n" + "fmla z8.s, z7.s, z0.s[0]\n" + "fmla z12.s, z7.s, z1.s[0]\n" + "fmla z16.s, z7.s, z2.s[0]\n" + "fmla z20.s, z7.s, z3.s[0]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z7.s, z4.s[0]\n" + "fmla z28.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla 
z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z9.s, z6.s, z0.s[0]\n" + "fmla z13.s, z6.s, z1.s[0]\n" + "fmla z17.s, z6.s, z2.s[0]\n" + "fmla z21.s, z6.s, z3.s[0]\n" + "fmla z25.s, z6.s, z4.s[0]\n" + "fmla z29.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z30.s, z6.s, z5.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "fmla z31.s, z7.s, z5.s[0]\n" + "fmla z10.s, z7.s, z0.s[0]\n" + "fmla z14.s, z7.s, z1.s[0]\n" + "fmla z18.s, z7.s, z2.s[0]\n" + "fmla z22.s, z7.s, z3.s[0]\n" + "fmla z26.s, z7.s, z4.s[0]\n" + "fmla z30.s, z7.s, z5.s[0]\n" + "fmla z11.s, z6.s, z0.s[0]\n" + "fmla z15.s, z6.s, z1.s[0]\n" + "fmla z19.s, z6.s, z2.s[0]\n" + "fmla z23.s, z6.s, z3.s[0]\n" + "fmla z27.s, z6.s, z4.s[0]\n" + "fmla z31.s, z6.s, z5.s[0]\n" "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x12]\n" + "ld1w { z6.s }, p5/Z, [x11]\n" + "fmla z8.s, z7.s, z0.s[1]\n" + "fmla z12.s, z7.s, z1.s[1]\n" + "fmla z16.s, z7.s, z2.s[1]\n" + "fmla z20.s, z7.s, z3.s[1]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z28.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z7.s, z4.s[1]\n" + "fmla z28.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z9.s, z6.s, z0.s[1]\n" + "fmla z13.s, z6.s, z1.s[1]\n" "addvl x10, x10, #1\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z6.s, z2.s[1]\n" + "fmla z21.s, z6.s, z3.s[1]\n" + "fmla z25.s, z6.s, z4.s[1]\n" + "fmla z29.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z30.s, z6.s, z5.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "fmla z31.s, z7.s, z5.s[1]\n" + "fmla z10.s, z7.s, z0.s[1]\n" + "fmla z14.s, z7.s, z1.s[1]\n" + "fmla z18.s, z7.s, z2.s[1]\n" + "fmla z22.s, z7.s, z3.s[1]\n" + "fmla z26.s, z7.s, z4.s[1]\n" + "fmla z30.s, z7.s, z5.s[1]\n" + "fmla z11.s, z6.s, z0.s[1]\n" + "fmla z15.s, z6.s, z1.s[1]\n" + "fmla z19.s, z6.s, z2.s[1]\n" + "fmla z23.s, z6.s, z3.s[1]\n" + "fmla z27.s, z6.s, z4.s[1]\n" + "fmla z31.s, z6.s, z5.s[1]\n" "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x12]\n" + "ld1w { z6.s }, p5/Z, [x11]\n" + "fmla z8.s, z7.s, z0.s[2]\n" + "fmla z12.s, z7.s, z1.s[2]\n" + "fmla z16.s, z7.s, z2.s[2]\n" + "fmla z20.s, z7.s, z3.s[2]\n" "subs x27, x27, #0x1\n" "addvl x12, x12, #1\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z28.s, z6.s, 
z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z7.s, z4.s[2]\n" + "fmla z28.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" "addvl x11, x11, #1\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z9.s, z6.s, z0.s[2]\n" + "fmla z13.s, z6.s, z1.s[2]\n" "addvl x10, x10, #1\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z17.s, z6.s, z2.s[2]\n" + "fmla z21.s, z6.s, z3.s[2]\n" + "fmla z25.s, z6.s, z4.s[2]\n" + "fmla z29.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z30.s, z6.s, z5.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "fmla z31.s, z7.s, z5.s[2]\n" + "fmla z10.s, z7.s, z0.s[2]\n" + "fmla z14.s, z7.s, z1.s[2]\n" + "fmla z18.s, z7.s, z2.s[2]\n" + "fmla z22.s, z7.s, z3.s[2]\n" + "fmla z26.s, z7.s, z4.s[2]\n" + "fmla z30.s, z7.s, z5.s[2]\n" + "fmla z11.s, z6.s, z0.s[2]\n" + "fmla z15.s, z6.s, z1.s[2]\n" + "fmla z19.s, z6.s, z2.s[2]\n" + "fmla z23.s, z6.s, z3.s[2]\n" + "fmla z27.s, z6.s, z4.s[2]\n" + "fmla z31.s, z6.s, z5.s[2]\n" "ble 82f\n" - "ld1w { z6.s }, p5/Z, [x12]\n" - "ld1w { z7.s }, p5/Z, [x11]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z7.s }, p5/Z, [x12]\n" + "ld1w { z6.s }, p5/Z, [x11]\n" + "fmla z8.s, z7.s, z0.s[3]\n" + "fmla z12.s, z7.s, z1.s[3]\n" + "fmla z16.s, z7.s, z2.s[3]\n" + "fmla z20.s, z7.s, z3.s[3]\n" "addvl x12, x12, #1\n" "addvl x11, x11, #1\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" + "fmla z24.s, z7.s, z4.s[3]\n" + "fmla z28.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" "addvl x10, x10, #1\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x9]\n" + "fmla z9.s, z6.s, z0.s[3]\n" + "fmla z13.s, z6.s, z1.s[3]\n" + "fmla z17.s, z6.s, z2.s[3]\n" + "fmla z21.s, z6.s, z3.s[3]\n" + "fmla z25.s, z6.s, z4.s[3]\n" + "fmla z29.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x9]\n" "addvl x9, x9, #1\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z30.s, z6.s, z5.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" - "fmla z31.s, z7.s, z5.s[3]\n" + "fmla z10.s, z7.s, z0.s[3]\n" + "fmla z14.s, z7.s, z1.s[3]\n" + "fmla z18.s, z7.s, z2.s[3]\n" + "fmla z22.s, z7.s, z3.s[3]\n" + "fmla z26.s, z7.s, z4.s[3]\n" + "fmla z30.s, z7.s, z5.s[3]\n" + "fmla z11.s, z6.s, z0.s[3]\n" + "fmla z15.s, z6.s, z1.s[3]\n" + "fmla z19.s, z6.s, z2.s[3]\n" + "fmla z23.s, z6.s, z3.s[3]\n" + "fmla z27.s, z6.s, z4.s[3]\n" + "fmla z31.s, z6.s, z5.s[3]\n" "82:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2307,4 +2307,4 @@ void sve_ffhybrid_fp32_mla_6x4VL ( } } // namespace arm_gemm -#endif // 
ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp index 3ee3e31206..887d78e1de 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp index 36fc9d75ca..57f42cce77 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp @@ -174,22 +174,22 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "b 6f\n" "4:" // Height 1: no bias "tbz %x[flags], #0, 5f\n" - "ld1w { z9.s }, p6/Z, [x13]\n" - "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x13]\n" + "ld1w { z20.s }, p5/Z, [x13, #1, MUL VL]\n" + "zip1 z8.d, z21.d, z14.d\n" + "zip2 z14.d, z21.d, z14.d\n" + "ld1w { z23.s }, p4/Z, [x13, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n" + "zip1 z9.d, z20.d, z15.d\n" + "zip2 z15.d, z20.d, z15.d\n" + "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" + "zip1 z10.d, z23.d, z16.d\n" + "zip2 z16.d, z23.d, z16.d\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "zip1 z12.d, z21.d, z18.d\n" + "zip2 z18.d, z21.d, z18.d\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" "b 6f\n" @@ -211,11 +211,11 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "7:" // Height 1: String loop "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 8f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 9f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -227,35 +227,35 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "ble 11f\n" "10:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z21.h }, p7/Z, [x12]\n" + "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n" + ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n" + ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x11]\n" + "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x10]\n" + "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n" + ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x9]\n" + "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z23.h }, p7/Z, [x28]\n" + "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "ld1h { z21.h }, p7/Z, [x27]\n" + "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "add x24, x24, #0x10\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" @@ -266,33 +266,33 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "bgt 10b\n" "11:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - 
".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + "ld1rqw { z22.s }, p0/Z, [x24]\n" + ".inst 0x658abed6 // bfcvt z22.h, p7/M, z22.s\n" + "uzp1 z22.h, z22.h, z22.h\n" + "ld1h { z21.h }, p7/Z, [x12]\n" + "ld1h { z20.h }, p7/Z, [x12, #1, MUL VL]\n" + ".inst 0x6475e6c8 // bfmmla z8.s, z22.h, z21.h\n" + ".inst 0x6474e6ce // bfmmla z14.s, z22.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x11]\n" + "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6475e6c9 // bfmmla z9.s, z22.h, z21.h\n" + ".inst 0x6474e6cf // bfmmla z15.s, z22.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x10]\n" + "ld1h { z20.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6475e6ca // bfmmla z10.s, z22.h, z21.h\n" + ".inst 0x6474e6d0 // bfmmla z16.s, z22.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x9]\n" + "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6475e6cb // bfmmla z11.s, z22.h, z21.h\n" + ".inst 0x6474e6d1 // bfmmla z17.s, z22.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28]\n" + "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6475e6cc // bfmmla z12.s, z22.h, z21.h\n" + ".inst 0x6474e6d2 // bfmmla z18.s, z22.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x27]\n" + "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6475e6cd // bfmmla z13.s, z22.h, z21.h\n" + ".inst 0x6474e6d3 // bfmmla z19.s, z22.h, z20.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" "addvl x10, x10, #2\n" @@ -312,21 +312,21 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "uzp1 z13.d, z13.d, z19.d\n" "tbz %x[flags], #1, 13f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" + "ld1rw { z21.s }, p7/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" + "ld1rw { z20.s }, p7/Z, [x20]\n" + "fmin z8.s, p7/M, z8.s, z21.s\n" + "fmin z9.s, p7/M, z9.s, z21.s\n" + "fmin z10.s, p7/M, z10.s, z21.s\n" + "fmin z11.s, p7/M, z11.s, z21.s\n" + "fmin z12.s, p7/M, z12.s, z21.s\n" + "fmin z13.s, p7/M, z13.s, z21.s\n" + "fmax z8.s, p7/M, z8.s, z20.s\n" + "fmax z9.s, p7/M, z9.s, z20.s\n" + "fmax z10.s, p7/M, z10.s, z20.s\n" + "fmax z11.s, p7/M, z11.s, z20.s\n" + "fmax z12.s, p7/M, z12.s, z20.s\n" + "fmax z13.s, p7/M, z13.s, z20.s\n" "13:" // Height 1: No activation "st1w { z8.s }, p6, 
[x13]\n" "st1w { z9.s }, p5, [x13, #1, MUL VL]\n" @@ -413,29 +413,29 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "18:" // Height 2: no bias "tbz %x[flags], #0, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x13, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x13]\n" - "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n" + "add x20, x13, x20, LSL #2\n" + "ld1w { z16.s }, p6/Z, [x13]\n" + "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x13, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" + "ld1w { z14.s }, p6/Z, [x20]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "zip1 z12.d, z21.d, z18.d\n" + "zip2 z18.d, z21.d, z18.d\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" "b 20f\n" @@ -457,12 +457,12 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "21:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 22f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -470,45 +470,45 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "b 23f\n" "22:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "23:" // Height 2: input setup done "cmp x25, #0x4\n" "ble 25f\n" "24:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, 
[x10]\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + "ld1rqw { z20.s }, p0/Z, [x23]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z23.h }, p7/Z, [x12]\n" + "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n" + "uzp1 z20.h, z20.h, z20.h\n" + "trn1 z24.d, z24.d, z20.d\n" + "ld1h { z21.h }, p7/Z, [x11]\n" + "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n" + ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x10]\n" + "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x9]\n" + "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n" + ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28]\n" + "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x27]\n" + "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "addvl x12, x12, #2\n" @@ -520,39 +520,39 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "bgt 24b\n" "25:" // Height 2: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - "ld1h { z7.h }, p7/Z, 
[x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + "ld1rqw { z20.s }, p0/Z, [x23]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z23.h }, p7/Z, [x12]\n" + "ld1h { z22.h }, p7/Z, [x12, #1, MUL VL]\n" + "uzp1 z20.h, z20.h, z20.h\n" + "trn1 z24.d, z24.d, z20.d\n" + "ld1h { z21.h }, p7/Z, [x11]\n" + "ld1h { z20.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n" + ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x10]\n" + "ld1h { z22.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x9]\n" + "ld1h { z20.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n" + ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28]\n" + "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x27]\n" + "ld1h { z20.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" "addvl x12, x12, #2\n" "addvl x11, x11, #2\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" "addvl x28, x28, #2\n" @@ -578,33 +578,33 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "uzp2 z13.d, z13.d, z19.d\n" "tbz %x[flags], #1, 27f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" + "ld1rw { z20.s }, p7/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z4.s, p7/M, z4.s, z1.s\n" - "fmin z14.s, p7/M, z14.s, z1.s\n" - "fmin z15.s, p7/M, z15.s, z1.s\n" - "fmin z16.s, p7/M, z16.s, z1.s\n" - "fmin z17.s, p7/M, z17.s, z1.s\n" - "fmin z18.s, p7/M, z18.s, z1.s\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmax z4.s, p7/M, z4.s, z0.s\n" - "fmax z14.s, p7/M, z14.s, z0.s\n" - "fmax z15.s, p7/M, z15.s, z0.s\n" - "fmax z16.s, p7/M, z16.s, z0.s\n" - "fmax z17.s, p7/M, z17.s, z0.s\n" - "fmax z18.s, p7/M, z18.s, z0.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" + "ld1rw { z19.s }, p7/Z, [x20]\n" + "fmin z4.s, p7/M, z4.s, z20.s\n" + "fmin z14.s, p7/M, z14.s, z20.s\n" + "fmin z15.s, p7/M, z15.s, z20.s\n" + "fmin z16.s, p7/M, z16.s, z20.s\n" + "fmin z17.s, p7/M, z17.s, z20.s\n" + "fmin z18.s, p7/M, z18.s, z20.s\n" + "fmin z8.s, p7/M, z8.s, z20.s\n" + "fmin z9.s, p7/M, z9.s, z20.s\n" + "fmin z10.s, p7/M, z10.s, z20.s\n" + "fmin z11.s, p7/M, z11.s, z20.s\n" + "fmin z12.s, p7/M, z12.s, z20.s\n" + "fmin z13.s, p7/M, z13.s, z20.s\n" + "fmax z4.s, p7/M, z4.s, z19.s\n" + "fmax z14.s, p7/M, z14.s, z19.s\n" + "fmax z15.s, p7/M, z15.s, z19.s\n" + "fmax z16.s, p7/M, z16.s, z19.s\n" + "fmax z17.s, p7/M, z17.s, z19.s\n" + "fmax z18.s, p7/M, 
z18.s, z19.s\n" + "fmax z8.s, p7/M, z8.s, z19.s\n" + "fmax z9.s, p7/M, z9.s, z19.s\n" + "fmax z10.s, p7/M, z10.s, z19.s\n" + "fmax z11.s, p7/M, z11.s, z19.s\n" + "fmax z12.s, p7/M, z12.s, z19.s\n" + "fmax z13.s, p7/M, z13.s, z19.s\n" "27:" // Height 2: No activation "st1w { z4.s }, p6, [x13]\n" "st1w { z14.s }, p5, [x13, #1, MUL VL]\n" @@ -709,38 +709,38 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "32:" // Height 3: no bias "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x13, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x13]\n" - "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n" + "add x21, x13, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z16.s }, p6/Z, [x13]\n" + "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n" + "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "ld1w { z21.s }, p6/Z, [x22]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n" - "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" - "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x21]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x20]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n" + "zip1 z12.d, z24.d, z18.d\n" + "zip2 z18.d, z24.d, z18.d\n" + "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" - "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n" "zip1 z20.d, z21.d, z26.d\n" "zip2 z26.d, z21.d, z26.d\n" "zip1 z21.d, z22.d, z27.d\n" @@ -751,8 +751,8 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "zip2 z29.d, z24.d, z29.d\n" "zip1 z24.d, z25.d, z30.d\n" "zip2 z30.d, z25.d, z30.d\n" - "zip1 z25.d, z4.d, z31.d\n" - "zip2 z31.d, z4.d, z31.d\n" + "zip1 z25.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 34f\n" "33:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -784,13 +784,13 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "35:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -799,125 +799,125 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "b 37f\n" "36:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "37:" // Height 3: input setup done "cmp x25, #0x4\n" "ble 39f\n" "38:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z3.h }, p7/Z, [x12]\n" "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n" + "ld1h { z1.h }, p7/Z, [x11]\n" + "trn1 z5.d, z5.d, z0.d\n" + "uzp1 z4.h, z4.h, z4.h\n" + "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n" + ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n" + ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x10]\n" "sub x25, x25, #0x4\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n" "cmp x25, #0x4\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" + ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x9]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n" "add x23, x23, #0x10\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" + ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n" + ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" "add x22, x22, #0x10\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, 
z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" "addvl x12, x12, #2\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" + ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n" + ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x27]\n" "addvl x11, x11, #2\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n" "addvl x10, x10, #2\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n" + ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n" "addvl x9, x9, #2\n" "addvl x28, x28, #2\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n" "addvl x27, x27, #2\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "bgt 38b\n" "39:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z3.h }, p7/Z, [x12]\n" "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n" + "ld1h { z1.h }, p7/Z, [x11]\n" + "trn1 z5.d, z5.d, z0.d\n" + "uzp1 z4.h, z4.h, z4.h\n" + "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n" + ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n" + ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n" + ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x10]\n" "addvl x12, x12, #2\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" + ".inst 0x6461e495 // bfmmla z21.s, z4.h, 
z1.h\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x9]\n" "addvl x10, x10, #2\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n" "addvl x9, x9, #2\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n" + ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" + ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" "addvl x28, x28, #2\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n" + ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x27]\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n" "addvl x27, x27, #2\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n" + ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "40:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -946,45 +946,45 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "uzp1 z25.d, z25.d, z31.d\n" "tbz %x[flags], #1, 41f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" - "add x20, %x[args_ptr], %[offset_min]\n" "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z4.s, p7/M, z4.s, z1.s\n" - "fmin z14.s, p7/M, z14.s, z1.s\n" - "fmin z15.s, p7/M, z15.s, z1.s\n" - "fmin z16.s, p7/M, z16.s, z1.s\n" - "fmin z17.s, p7/M, z17.s, z1.s\n" - "fmin z18.s, p7/M, z18.s, z1.s\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmin z20.s, p7/M, z20.s, z1.s\n" - "fmin z21.s, p7/M, z21.s, z1.s\n" - "fmin z22.s, p7/M, z22.s, z1.s\n" - "fmin z23.s, p7/M, z23.s, z1.s\n" - "fmin z24.s, p7/M, z24.s, z1.s\n" - "fmin z25.s, p7/M, z25.s, z1.s\n" - "fmax z4.s, p7/M, z4.s, z0.s\n" - "fmax z14.s, p7/M, z14.s, z0.s\n" - "fmax z15.s, p7/M, z15.s, z0.s\n" - "fmax z16.s, p7/M, z16.s, z0.s\n" - "fmax z17.s, p7/M, z17.s, z0.s\n" - "fmax z18.s, p7/M, 
z18.s, z0.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" - "fmax z20.s, p7/M, z20.s, z0.s\n" - "fmax z21.s, p7/M, z21.s, z0.s\n" - "fmax z22.s, p7/M, z22.s, z0.s\n" - "fmax z23.s, p7/M, z23.s, z0.s\n" - "fmax z24.s, p7/M, z24.s, z0.s\n" - "fmax z25.s, p7/M, z25.s, z0.s\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1rw { z19.s }, p7/Z, [x20]\n" + "fmin z4.s, p7/M, z4.s, z0.s\n" + "fmin z14.s, p7/M, z14.s, z0.s\n" + "fmin z15.s, p7/M, z15.s, z0.s\n" + "fmin z16.s, p7/M, z16.s, z0.s\n" + "fmin z17.s, p7/M, z17.s, z0.s\n" + "fmin z18.s, p7/M, z18.s, z0.s\n" + "fmin z8.s, p7/M, z8.s, z0.s\n" + "fmin z9.s, p7/M, z9.s, z0.s\n" + "fmin z10.s, p7/M, z10.s, z0.s\n" + "fmin z11.s, p7/M, z11.s, z0.s\n" + "fmin z12.s, p7/M, z12.s, z0.s\n" + "fmin z13.s, p7/M, z13.s, z0.s\n" + "fmin z20.s, p7/M, z20.s, z0.s\n" + "fmin z21.s, p7/M, z21.s, z0.s\n" + "fmin z22.s, p7/M, z22.s, z0.s\n" + "fmin z23.s, p7/M, z23.s, z0.s\n" + "fmin z24.s, p7/M, z24.s, z0.s\n" + "fmin z25.s, p7/M, z25.s, z0.s\n" + "fmax z4.s, p7/M, z4.s, z19.s\n" + "fmax z14.s, p7/M, z14.s, z19.s\n" + "fmax z15.s, p7/M, z15.s, z19.s\n" + "fmax z16.s, p7/M, z16.s, z19.s\n" + "fmax z17.s, p7/M, z17.s, z19.s\n" + "fmax z18.s, p7/M, z18.s, z19.s\n" + "fmax z8.s, p7/M, z8.s, z19.s\n" + "fmax z9.s, p7/M, z9.s, z19.s\n" + "fmax z10.s, p7/M, z10.s, z19.s\n" + "fmax z11.s, p7/M, z11.s, z19.s\n" + "fmax z12.s, p7/M, z12.s, z19.s\n" + "fmax z13.s, p7/M, z13.s, z19.s\n" + "fmax z20.s, p7/M, z20.s, z19.s\n" + "fmax z21.s, p7/M, z21.s, z19.s\n" + "fmax z22.s, p7/M, z22.s, z19.s\n" + "fmax z23.s, p7/M, z23.s, z19.s\n" + "fmax z24.s, p7/M, z24.s, z19.s\n" + "fmax z25.s, p7/M, z25.s, z19.s\n" "41:" // Height 3: No activation "st1w { z4.s }, p6, [x13]\n" "st1w { z14.s }, p5, [x13, #1, MUL VL]\n" @@ -1098,57 +1098,57 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "46:" // Height 4: no bias "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x13, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x13]\n" + "add x22, x13, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p5/Z, [x13, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x13, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x13, #3, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [x13, #4, MUL VL]\n" + "ld1w { z16.s }, p6/Z, [x13]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p5/Z, [x13, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x13, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x13, #3, MUL VL]\n" + "ld1w { z24.s }, p2/Z, [x13, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x13, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "ld1w { z21.s }, p6/Z, [x22]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n" - "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" - "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z14.s }, p6/Z, 
[x22]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x21]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n" + "zip1 z12.d, z24.d, z18.d\n" + "zip2 z18.d, z24.d, z18.d\n" + "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" - "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n" - "ld1w { z26.s }, p6/Z, [x21]\n" + "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z26.s }, p6/Z, [x20]\n" "zip1 z20.d, z21.d, z26.d\n" "zip2 z26.d, z21.d, z26.d\n" - "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n" "zip1 z21.d, z22.d, z27.d\n" "zip2 z27.d, z22.d, z27.d\n" - "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n" "zip1 z22.d, z23.d, z28.d\n" "zip2 z28.d, z23.d, z28.d\n" - "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n" "zip1 z23.d, z24.d, z29.d\n" "zip2 z29.d, z24.d, z29.d\n" "zip1 z24.d, z25.d, z30.d\n" "zip2 z30.d, z25.d, z30.d\n" - "zip1 z25.d, z4.d, z31.d\n" - "zip2 z31.d, z4.d, z31.d\n" + "zip1 z25.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 48f\n" "47:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -1180,14 +1180,14 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "49:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1197,135 +1197,135 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( "b 51f\n" "50:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "51:" // Height 4: input setup done "cmp x25, #0x4\n" "ble 53f\n" "52:" // Height 4: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - "ld1rqw { z3.s }, p0/Z, [x21]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" - "uzp1 z0.h, 
z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "ld1rqw { z7.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x23]\n" + ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "uzp1 z7.h, z7.h, z7.h\n" + "ld1h { z3.h }, p7/Z, [x12]\n" + "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z1.h }, p7/Z, [x11]\n" + "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n" + "uzp1 z4.h, z4.h, z4.h\n" + "trn1 z7.d, z7.d, z6.d\n" + ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n" "sub x25, x25, #0x4\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" + "trn1 z5.d, z5.d, z4.d\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x10]\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n" "cmp x25, #0x4\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" + ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x9]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n" "add x23, x23, #0x10\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" "add x22, x22, #0x10\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" "add x21, x21, #0x10\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" + ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n" + ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x27]\n" "addvl x12, x12, #2\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n" "addvl x11, x11, #2\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" 
- ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n" + ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n" "addvl x10, x10, #2\n" "addvl x9, x9, #2\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n" + ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n" "addvl x28, x28, #2\n" "addvl x27, x27, #2\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n" + ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "bgt 52b\n" "53:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - "ld1rqw { z3.s }, p0/Z, [x21]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x12]\n" - "ld1h { z5.h }, p7/Z, [x12, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z6.h }, p7/Z, [x11]\n" - "ld1h { z7.h }, p7/Z, [x11, #1, MUL VL]\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "ld1rqw { z7.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x23]\n" + ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "uzp1 z7.h, z7.h, z7.h\n" + "ld1h { z3.h }, p7/Z, [x12]\n" + "ld1h { z2.h }, p7/Z, [x12, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z1.h }, p7/Z, [x11]\n" + "ld1h { z0.h }, p7/Z, [x11, #1, MUL VL]\n" + "uzp1 z4.h, z4.h, z4.h\n" + "trn1 z7.d, z7.d, z6.d\n" + ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n" "addvl x12, x12, #2\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x10]\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x10, #1, MUL VL]\n" + "trn1 z5.d, z5.d, z4.d\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x10]\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x10, #1, MUL VL]\n" "addvl x11, x11, #2\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x9]\n" + ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x9]\n" "addvl x10, x10, #2\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x9, #1, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" + ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x9, #1, MUL VL]\n" + ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n" "addvl x9, x9, #2\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // 
bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" + ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" "addvl x28, x28, #2\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x27]\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x27, #1, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n" + ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x27]\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x27, #1, MUL VL]\n" + ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n" "addvl x27, x27, #2\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n" + ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n" + ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n" + ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n" + ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n" + ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "54:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -1461,4 +1461,4 @@ void sve_ffhybrid_fp32bf16fp32_mmla_4x6VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp index 5792a7152d..d0ef531c33 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp index 7649336c36..576bd47039 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL/generic.cpp @@ -53,33 +53,33 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL( __asm__ __volatile__( "ptrue p0.b\n" "1:" // Height loop - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x24, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x26, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x25, %x[Apanel]\n" "2:" // Width loop - "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" "cntw x23, ALL, MUL #2\n" - "add x22, x26, x20, LSL #1\n" + "add x22, x24, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" - "cmp x25, x23\n" + "cmp x26, x23\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov %x[Apanel], x24\n" + "mov %x[Apanel], x25\n" "bgt 3f\n" "decw x23\n" - "cmp x25, x23\n" - "mov x21, x26\n" + "cmp x26, x23\n" + "mov x21, x24\n" "bgt 3f\n" - "mov x22, x26\n" + "mov x22, x24\n" "3:" // B setup done "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" - "ld1h { z4.h }, p0/Z, [x26]\n" + "ld1h { z4.h }, p0/Z, [x24]\n" "mov z11.b, #0x0\n" "mov z12.b, #0x0\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" @@ -88,13 +88,13 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL( "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" "mov z15.b, #0x0\n" "mov z16.b, #0x0\n" - "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n" + "ld1h { z5.h }, p0/Z, [x24, #1, MUL VL]\n" "mov z17.b, #0x0\n" "mov z18.b, #0x0\n" "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n" "mov z19.b, #0x0\n" "mov z20.b, #0x0\n" - "addvl x26, x26, #2\n" + "addvl x24, x24, #2\n" "mov z21.b, #0x0\n" "mov z22.b, #0x0\n" "add %x[Apanel], %x[Apanel], #0x30\n" @@ -109,83 +109,83 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL( "mov z31.b, #0x0\n" "blt 5f\n" "4:" // main loop head - "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n" - "ld1h { z6.h }, p0/Z, [x22]\n" + "ld1h { z7.h }, p0/Z, [x22]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n" - "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n" - "ld1h { z4.h }, p0/Z, [x21]\n" - "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n" + "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 
0x6464e4da // bfmmla z26.s, z6.h, z4.h\n" + ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n" + "ld1h { z5.h }, p0/Z, [x21]\n" + "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n" + ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n" + ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n" "sub x20, x20, #0x2\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n" "cmp x20, #0x2\n" - ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n" - ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x26]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n" + ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n" + ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n" + "ld1h { z3.h }, p0/Z, [x24]\n" + ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n" + ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n" - ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n" - ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n" + ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n" + ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n" - "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n" - ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n" + ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n" + "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n" + ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n" + ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n" + "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n" + ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n" ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n" ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n" - "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n" - ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x21, #2, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n" + "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n" + ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n" + "ld1h { z3.h }, p0/Z, [x21, #2, MUL VL]\n" "ld1h { z7.h }, p0/Z, [x21, #3, MUL VL]\n" - ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n" - ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n" - ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n" - ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n" + ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n" + ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n" "addvl x22, x22, #4\n" - ".inst 0x6464e455 // bfmmla 
z21.s, z2.h, z4.h\n" - ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n" + ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n" + ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n" "addvl x21, x21, #4\n" - ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n" - ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n" - "ld1h { z4.h }, p0/Z, [x26, #2, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n" + ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n" + "ld1h { z4.h }, p0/Z, [x24, #2, MUL VL]\n" + ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n" ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n" - ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n" + ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n" ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" - "ld1h { z5.h }, p0/Z, [x26, #3, MUL VL]\n" - ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n" - ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" + "ld1h { z5.h }, p0/Z, [x24, #3, MUL VL]\n" + ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n" + ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n" "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n" "add %x[Apanel], %x[Apanel], #0x80\n" - "addvl x26, x26, #4\n" + "addvl x24, x24, #4\n" "bge 4b\n" "5:" // main loop skip - "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" @@ -193,115 +193,115 @@ void sve_ffinterleaved_bf16fp32_mmla_8x3VL( "ld1h { z6.h }, p0/Z, [x22]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n" - "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n" - "ld1h { z4.h }, p0/Z, [x21]\n" - "ld1h { z5.h }, p0/Z, [x21, #1, MUL VL]\n" + "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n" + ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n" + "ld1h { z5.h }, p0/Z, [x21]\n" + "ld1h { z4.h }, p0/Z, [x21, #1, MUL VL]\n" ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n" ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n" + ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n" + ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n" "addvl x22, x22, #2\n" - ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n" - ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n" + ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n" + ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n" "addvl x21, x21, #2\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n" - ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n" - ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n" - ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n" - ".inst 0x6465e47f // bfmmla 
z31.s, z3.h, z5.h\n" + ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n" + ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n" + ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n" + ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n" + ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n" + ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n" + ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n" + ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n" "cbz x20, 6f\n" - "ld1h { z6.h }, p0/Z, [x26]\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" - "ld1h { z7.h }, p0/Z, [x26, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z4.h }, p0/Z, [x22]\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n" - "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x21]\n" - "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n" - ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n" - ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n" + "ld1h { z1.h }, p0/Z, [x24]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1h { z0.h }, p0/Z, [x24, #1, MUL VL]\n" + ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n" + "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n" + ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n" + ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n" + ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n" + "ld1h { z3.h }, p0/Z, [x22]\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n" + "ld1h { z2.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z1.h }, p0/Z, [x21]\n" + "ld1h { z0.h }, p0/Z, [x21, #1, MUL VL]\n" + ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n" + ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n" - ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n" - ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n" - ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n" - ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n" - ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n" - ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" - ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n" - ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" + ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" + ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n" + ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, 
z0.h\n" + ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n" + ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n" + ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "6:" // multiply loop done - "decw x25, ALL, MUL #3\n" - "uzp1 z4.d, z8.d, z11.d\n" + "decw x26, ALL, MUL #3\n" + "uzp1 z0.d, z8.d, z11.d\n" "uzp2 z8.d, z8.d, z11.d\n" - "uzp1 z11.d, z9.d, z12.d\n" + "uzp1 z1.d, z9.d, z12.d\n" "uzp2 z9.d, z9.d, z12.d\n" - "st1w { z4.s }, p0, [%x[Cpanel]]\n" - "uzp1 z12.d, z10.d, z13.d\n" + "st1w { z0.s }, p0, [%x[Cpanel]]\n" + "uzp1 z0.d, z10.d, z13.d\n" "uzp2 z10.d, z10.d, z13.d\n" - "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n" - "uzp1 z13.d, z14.d, z17.d\n" + "st1w { z1.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "uzp1 z2.d, z14.d, z17.d\n" "uzp2 z14.d, z14.d, z17.d\n" "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "uzp1 z17.d, z15.d, z18.d\n" - "cmp x25, XZR\n" + "uzp1 z1.d, z15.d, z18.d\n" + "cmp x26, XZR\n" "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n" "uzp2 z15.d, z15.d, z18.d\n" - "uzp1 z18.d, z16.d, z19.d\n" + "uzp1 z17.d, z16.d, z19.d\n" "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "uzp2 z16.d, z16.d, z19.d\n" - "uzp1 z19.d, z20.d, z23.d\n" - "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "uzp1 z0.d, z20.d, z23.d\n" + "st1w { z2.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "uzp2 z20.d, z20.d, z23.d\n" "uzp1 z23.d, z21.d, z24.d\n" - "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n" "addvl %x[Cpanel], %x[Cpanel], #16\n" "uzp2 z21.d, z21.d, z24.d\n" - "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" - "uzp1 z24.d, z22.d, z25.d\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "uzp1 z19.d, z22.d, z25.d\n" "uzp2 z22.d, z22.d, z25.d\n" "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" - "uzp1 z25.d, z26.d, z29.d\n" + "uzp1 z18.d, z26.d, z29.d\n" "uzp2 z26.d, z26.d, z29.d\n" "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" - "uzp1 z29.d, z27.d, z30.d\n" + "uzp1 z17.d, z27.d, z30.d\n" "uzp2 z27.d, z27.d, z30.d\n" "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" - "uzp1 z30.d, z28.d, z31.d\n" + "uzp1 z16.d, z28.d, z31.d\n" "uzp2 z28.d, z28.d, z31.d\n" - "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" - "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" "st1w { z21.s }, p0, [%x[Cpanel]]\n" "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n" - "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n" "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp index 6d36bf8bbf..60f1b699c3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm 
Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp index 1d502f5354..69ddb21c31 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/a64fx.cpp @@ -52,33 +52,33 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( __asm__ __volatile__( "ptrue p0.b\n" "1:" // Height loop - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x24, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x26, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x25, %x[Apanel]\n" "2:" // Width loop - "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" "cnth x23, ALL, MUL #2\n" - "add x22, x26, x20, LSL #1\n" + "add x22, x24, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" - "cmp x25, x23\n" + "cmp x26, x23\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov %x[Apanel], x24\n" + "mov %x[Apanel], x25\n" "bgt 3f\n" "dech x23\n" - "cmp x25, x23\n" - "mov x21, x26\n" + "cmp x26, x23\n" + "mov x21, x24\n" "bgt 3f\n" - "mov x22, x26\n" + "mov x22, x24\n" "3:" // B setup done "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" - "ld1h { z0.h }, p0/Z, [x26]\n" + "ld1h { z0.h }, p0/Z, [x24]\n" "mov z11.b, #0x0\n" "mov z12.b, #0x0\n" "ld1h { z1.h }, p0/Z, [x22]\n" @@ -116,12 +116,12 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "fmla z11.h, p0/M, z0.h, z4.h\n" "fmla z12.h, p0/M, z1.h, z4.h\n" "fmla z13.h, p0/M, z2.h, z4.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #10]\n" "fmla z14.h, p0/M, z0.h, z5.h\n" "fmla z15.h, p0/M, z1.h, z5.h\n" "cmp x20, #0x2\n" "fmla z16.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #12]\n" "fmla z17.h, p0/M, z0.h, z6.h\n" "fmla z18.h, p0/M, z1.h, z6.h\n" "fmla z19.h, p0/M, z2.h, z6.h\n" @@ -130,57 
+130,57 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "fmla z21.h, p0/M, z1.h, z3.h\n" "fmla z22.h, p0/M, z2.h, z3.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n" - "fmla z23.h, p0/M, z0.h, z4.h\n" - "fmla z24.h, p0/M, z1.h, z4.h\n" - "fmla z25.h, p0/M, z2.h, z4.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n" + "fmla z23.h, p0/M, z0.h, z7.h\n" + "fmla z24.h, p0/M, z1.h, z7.h\n" + "fmla z25.h, p0/M, z2.h, z7.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #18]\n" + "fmla z26.h, p0/M, z0.h, z4.h\n" + "fmla z27.h, p0/M, z1.h, z4.h\n" + "fmla z28.h, p0/M, z2.h, z4.h\n" + "ld1rh { z4.h }, p0/Z, [%x[Apanel], #20]\n" "fmla z29.h, p0/M, z0.h, z6.h\n" - "ld1h { z0.h }, p0/Z, [x26, #1, MUL VL]\n" + "ld1h { z7.h }, p0/Z, [x24, #1, MUL VL]\n" "fmla z30.h, p0/M, z1.h, z6.h\n" "fmla z31.h, p0/M, z2.h, z6.h\n" - "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" + "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n" "ld1h { z2.h }, p0/Z, [x21, #1, MUL VL]\n" - "fmla z8.h, p0/M, z0.h, z3.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n" - "fmla z9.h, p0/M, z1.h, z3.h\n" + "fmla z8.h, p0/M, z7.h, z3.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n" + "fmla z9.h, p0/M, z6.h, z3.h\n" "fmla z10.h, p0/M, z2.h, z3.h\n" - "fmla z11.h, p0/M, z0.h, z4.h\n" + "fmla z11.h, p0/M, z7.h, z5.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n" - "fmla z12.h, p0/M, z1.h, z4.h\n" - "fmla z13.h, p0/M, z2.h, z4.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n" - "fmla z14.h, p0/M, z0.h, z5.h\n" - "fmla z15.h, p0/M, z1.h, z5.h\n" - "addvl x26, x26, #2\n" - "fmla z16.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n" - "fmla z17.h, p0/M, z0.h, z6.h\n" - "fmla z18.h, p0/M, z1.h, z6.h\n" - "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n" + "fmla z12.h, p0/M, z6.h, z5.h\n" + "fmla z13.h, p0/M, z2.h, z5.h\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #26]\n" + "fmla z14.h, p0/M, z7.h, z4.h\n" + "fmla z15.h, p0/M, z6.h, z4.h\n" + "addvl x24, x24, #2\n" + "fmla z16.h, p0/M, z2.h, z4.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n" + "fmla z17.h, p0/M, z7.h, z1.h\n" + "fmla z18.h, p0/M, z6.h, z1.h\n" + "fmla z19.h, p0/M, z2.h, z1.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n" "addvl x22, x22, #2\n" "addvl x21, x21, #2\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" + "fmla z20.h, p0/M, z7.h, z3.h\n" + "fmla z21.h, p0/M, z6.h, z3.h\n" "fmla z22.h, p0/M, z2.h, z3.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" - "fmla z23.h, p0/M, z0.h, z4.h\n" - "fmla z24.h, p0/M, z1.h, z4.h\n" - "fmla z25.h, p0/M, z2.h, z4.h\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" + "fmla z23.h, p0/M, z7.h, z5.h\n" + "fmla z24.h, p0/M, z6.h, z5.h\n" + "fmla z25.h, p0/M, z2.h, z5.h\n" + "fmla z26.h, p0/M, z7.h, z0.h\n" "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" + "fmla z27.h, p0/M, z6.h, z0.h\n" + "fmla z28.h, p0/M, z2.h, z0.h\n" "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "ld1h { z0.h }, p0/Z, [x26]\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z29.h, p0/M, z7.h, z1.h\n" + "ld1h { z0.h }, p0/Z, [x24]\n" + "fmla z30.h, p0/M, z6.h, z1.h\n" + "fmla z31.h, p0/M, z2.h, z1.h\n" "ld1h { z1.h }, p0/Z, [x22]\n" "ld1h { z2.h }, p0/Z, [x21]\n" "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" @@ -188,9 +188,9 
@@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "5:" // main loop skip "fmla z8.h, p0/M, z0.h, z3.h\n" "fmla z9.h, p0/M, z1.h, z3.h\n" - "addvl x26, x26, #1\n" + "addvl x24, x24, #1\n" "fmla z10.h, p0/M, z2.h, z3.h\n" - "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n" "fmla z11.h, p0/M, z0.h, z4.h\n" "fmla z12.h, p0/M, z1.h, z4.h\n" "fmla z13.h, p0/M, z2.h, z4.h\n" @@ -203,11 +203,11 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "fmla z17.h, p0/M, z0.h, z6.h\n" "fmla z18.h, p0/M, z1.h, z6.h\n" "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z0.h, z7.h\n" + "fmla z21.h, p0/M, z1.h, z7.h\n" "addvl x21, x21, #1\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" + "fmla z22.h, p0/M, z2.h, z7.h\n" "fmla z23.h, p0/M, z0.h, z4.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" "fmla z24.h, p0/M, z1.h, z4.h\n" @@ -215,50 +215,50 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "fmla z26.h, p0/M, z0.h, z5.h\n" "fmla z27.h, p0/M, z1.h, z5.h\n" "fmla z28.h, p0/M, z2.h, z5.h\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z29.h, p0/M, z0.h, z3.h\n" + "fmla z30.h, p0/M, z1.h, z3.h\n" + "fmla z31.h, p0/M, z2.h, z3.h\n" "cbz x20, 6f\n" - "ld1h { z0.h }, p0/Z, [x26]\n" - "ld1h { z1.h }, p0/Z, [x22]\n" - "ld1h { z2.h }, p0/Z, [x21]\n" + "ld1h { z6.h }, p0/Z, [x24]\n" + "ld1h { z5.h }, p0/Z, [x22]\n" + "ld1h { z4.h }, p0/Z, [x21]\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" - "fmla z8.h, p0/M, z0.h, z3.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" - "fmla z9.h, p0/M, z1.h, z3.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" - "fmla z10.h, p0/M, z2.h, z3.h\n" - "fmla z11.h, p0/M, z0.h, z4.h\n" - "fmla z12.h, p0/M, z1.h, z4.h\n" - "fmla z13.h, p0/M, z2.h, z4.h\n" + "fmla z8.h, p0/M, z6.h, z3.h\n" + "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n" + "fmla z9.h, p0/M, z5.h, z3.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n" + "fmla z10.h, p0/M, z4.h, z3.h\n" + "fmla z11.h, p0/M, z6.h, z2.h\n" + "fmla z12.h, p0/M, z5.h, z2.h\n" + "fmla z13.h, p0/M, z4.h, z2.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" - "fmla z14.h, p0/M, z0.h, z5.h\n" - "fmla z15.h, p0/M, z1.h, z5.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" - "fmla z16.h, p0/M, z2.h, z5.h\n" - "fmla z17.h, p0/M, z0.h, z6.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" - "fmla z18.h, p0/M, z1.h, z6.h\n" - "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" + "fmla z14.h, p0/M, z6.h, z1.h\n" + "fmla z15.h, p0/M, z5.h, z1.h\n" + "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n" + "fmla z16.h, p0/M, z4.h, z1.h\n" + "fmla z17.h, p0/M, z6.h, z0.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n" + "fmla z18.h, p0/M, z5.h, z0.h\n" + "fmla z19.h, p0/M, z4.h, z0.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z6.h, z3.h\n" + "fmla z21.h, p0/M, z5.h, z3.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" - "fmla z23.h, p0/M, z0.h, z4.h\n" - "fmla z24.h, p0/M, z1.h, z4.h\n" - "fmla z25.h, p0/M, z2.h, z4.h\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - 
"fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z22.h, p0/M, z4.h, z3.h\n" + "fmla z23.h, p0/M, z6.h, z2.h\n" + "fmla z24.h, p0/M, z5.h, z2.h\n" + "fmla z25.h, p0/M, z4.h, z2.h\n" + "fmla z26.h, p0/M, z6.h, z1.h\n" + "fmla z27.h, p0/M, z5.h, z1.h\n" + "fmla z28.h, p0/M, z4.h, z1.h\n" + "fmla z29.h, p0/M, z6.h, z0.h\n" + "fmla z30.h, p0/M, z5.h, z0.h\n" + "fmla z31.h, p0/M, z4.h, z0.h\n" "6:" // multiply loop done - "dech x25, ALL, MUL #3\n" + "dech x26, ALL, MUL #3\n" "st1h { z8.h }, p0, [%x[Cpanel]]\n" - "cmp x25, XZR\n" + "cmp x26, XZR\n" "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n" "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n" @@ -289,7 +289,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)) - : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp index de219aa2bf..23503fa108 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp16_mla_8x3VL/generic.cpp @@ -52,26 +52,26 @@ void sve_ffinterleaved_fp16_mla_8x3VL( __asm__ __volatile__( "ptrue p0.b\n" "1:" // Height loop - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x24, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x26, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x25, %x[Apanel]\n" "2:" // Width loop - "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" "cnth x23, ALL, MUL #2\n" - "add x22, x26, x20, LSL #1\n" + "add x22, x24, x20, LSL #1\n" "add x21, x22, x20, LSL #1\n" "add x20, x21, x20, LSL #1\n" - "cmp x25, x23\n" + "cmp x26, x23\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov %x[Apanel], x24\n" + "mov %x[Apanel], x25\n" "bgt 3f\n" "dech x23\n" - "cmp x25, x23\n" - "mov x21, x26\n" + "cmp x26, x23\n" + "mov x21, x24\n" "bgt 3f\n" - "mov x22, x26\n" + "mov x22, x24\n" "3:" // B setup done "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" @@ -81,7 +81,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL( "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" "mov z11.b, #0x0\n" "mov z12.b, #0x0\n" - "ld1h { z2.h }, p0/Z, [x26]\n" + "ld1h { z2.h }, p0/Z, [x24]\n" "mov z13.b, #0x0\n" "mov z14.b, #0x0\n" "ld1h { z3.h }, p0/Z, [x22]\n" @@ -107,19 +107,19 @@ 
void sve_ffinterleaved_fp16_mla_8x3VL( "4:" // main loop head "fmla z8.h, z2.h, z0.h[0]\n" "fmla z11.h, z2.h, z0.h[1]\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n" "fmla z14.h, z2.h, z0.h[2]\n" "fmla z17.h, z2.h, z0.h[3]\n" - "ld1h { z5.h }, p0/Z, [x26, #1, MUL VL]\n" + "ld1h { z6.h }, p0/Z, [x24, #1, MUL VL]\n" "fmla z20.h, z2.h, z0.h[4]\n" "fmla z23.h, z2.h, z0.h[5]\n" - "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n" + "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n" "fmla z26.h, z2.h, z0.h[6]\n" "fmla z29.h, z2.h, z0.h[7]\n" - "ld1h { z7.h }, p0/Z, [x21, #1, MUL VL]\n" + "ld1h { z1.h }, p0/Z, [x21, #1, MUL VL]\n" "fmla z9.h, z3.h, z0.h[0]\n" "fmla z12.h, z3.h, z0.h[1]\n" - "addvl x26, x26, #2\n" + "addvl x24, x24, #2\n" "fmla z15.h, z3.h, z0.h[2]\n" "fmla z18.h, z3.h, z0.h[3]\n" "addvl x22, x22, #2\n" @@ -137,36 +137,36 @@ void sve_ffinterleaved_fp16_mla_8x3VL( "add %x[Apanel], %x[Apanel], #0x20\n" "fmla z22.h, z4.h, z0.h[4]\n" "fmla z25.h, z4.h, z0.h[5]\n" - "ld1h { z2.h }, p0/Z, [x26]\n" + "ld1h { z2.h }, p0/Z, [x24]\n" "fmla z28.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "fmla z8.h, z5.h, z1.h[0]\n" - "fmla z11.h, z5.h, z1.h[1]\n" + "fmla z8.h, z6.h, z7.h[0]\n" + "fmla z11.h, z6.h, z7.h[1]\n" "ld1h { z3.h }, p0/Z, [x22]\n" - "fmla z14.h, z5.h, z1.h[2]\n" - "fmla z17.h, z5.h, z1.h[3]\n" + "fmla z14.h, z6.h, z7.h[2]\n" + "fmla z17.h, z6.h, z7.h[3]\n" "ld1h { z4.h }, p0/Z, [x21]\n" - "fmla z20.h, z5.h, z1.h[4]\n" - "fmla z23.h, z5.h, z1.h[5]\n" - "fmla z26.h, z5.h, z1.h[6]\n" - "fmla z29.h, z5.h, z1.h[7]\n" - "fmla z9.h, z6.h, z1.h[0]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z15.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z1.h[3]\n" - "fmla z21.h, z6.h, z1.h[4]\n" - "fmla z24.h, z6.h, z1.h[5]\n" - "fmla z27.h, z6.h, z1.h[6]\n" - "fmla z30.h, z6.h, z1.h[7]\n" - "fmla z10.h, z7.h, z1.h[0]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z16.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z1.h[3]\n" - "fmla z22.h, z7.h, z1.h[4]\n" - "fmla z25.h, z7.h, z1.h[5]\n" - "fmla z28.h, z7.h, z1.h[6]\n" - "fmla z31.h, z7.h, z1.h[7]\n" + "fmla z20.h, z6.h, z7.h[4]\n" + "fmla z23.h, z6.h, z7.h[5]\n" + "fmla z26.h, z6.h, z7.h[6]\n" + "fmla z29.h, z6.h, z7.h[7]\n" + "fmla z9.h, z5.h, z7.h[0]\n" + "fmla z12.h, z5.h, z7.h[1]\n" + "fmla z15.h, z5.h, z7.h[2]\n" + "fmla z18.h, z5.h, z7.h[3]\n" + "fmla z21.h, z5.h, z7.h[4]\n" + "fmla z24.h, z5.h, z7.h[5]\n" + "fmla z27.h, z5.h, z7.h[6]\n" + "fmla z30.h, z5.h, z7.h[7]\n" + "fmla z10.h, z1.h, z7.h[0]\n" + "fmla z13.h, z1.h, z7.h[1]\n" + "fmla z16.h, z1.h, z7.h[2]\n" + "fmla z19.h, z1.h, z7.h[3]\n" + "fmla z22.h, z1.h, z7.h[4]\n" + "fmla z25.h, z1.h, z7.h[5]\n" + "fmla z28.h, z1.h, z7.h[6]\n" + "fmla z31.h, z1.h, z7.h[7]\n" "bge 4b\n" "5:" // main loop skip "fmla z8.h, z2.h, z0.h[0]\n" @@ -174,7 +174,7 @@ void sve_ffinterleaved_fp16_mla_8x3VL( "add %x[Apanel], %x[Apanel], #0x10\n" "fmla z14.h, z2.h, z0.h[2]\n" "fmla z17.h, z2.h, z0.h[3]\n" - "addvl x26, x26, #1\n" + "addvl x24, x24, #1\n" "fmla z20.h, z2.h, z0.h[4]\n" "fmla z23.h, z2.h, z0.h[5]\n" "addvl x22, x22, #1\n" @@ -198,39 +198,39 @@ void sve_ffinterleaved_fp16_mla_8x3VL( "fmla z28.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "cbz x20, 6f\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "ld1h { z5.h }, p0/Z, [x26]\n" - "fmla z8.h, z5.h, z0.h[0]\n" - "ld1h { z6.h }, p0/Z, [x22]\n" - "ld1h { z7.h }, p0/Z, [x21]\n" - "fmla z11.h, z5.h, z0.h[1]\n" - "fmla z14.h, z5.h, z0.h[2]\n" - "fmla z17.h, z5.h, z0.h[3]\n" + "ld1rqh { z3.h }, 
p0/Z, [%x[Apanel]]\n" + "ld1h { z2.h }, p0/Z, [x24]\n" + "fmla z8.h, z2.h, z3.h[0]\n" + "ld1h { z1.h }, p0/Z, [x22]\n" + "ld1h { z0.h }, p0/Z, [x21]\n" + "fmla z11.h, z2.h, z3.h[1]\n" + "fmla z14.h, z2.h, z3.h[2]\n" + "fmla z17.h, z2.h, z3.h[3]\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla z20.h, z5.h, z0.h[4]\n" - "fmla z23.h, z5.h, z0.h[5]\n" - "fmla z26.h, z5.h, z0.h[6]\n" - "fmla z29.h, z5.h, z0.h[7]\n" - "fmla z9.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z0.h[1]\n" - "fmla z15.h, z6.h, z0.h[2]\n" - "fmla z18.h, z6.h, z0.h[3]\n" - "fmla z21.h, z6.h, z0.h[4]\n" - "fmla z24.h, z6.h, z0.h[5]\n" - "fmla z27.h, z6.h, z0.h[6]\n" - "fmla z30.h, z6.h, z0.h[7]\n" - "fmla z10.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z0.h[1]\n" - "fmla z16.h, z7.h, z0.h[2]\n" - "fmla z19.h, z7.h, z0.h[3]\n" - "fmla z22.h, z7.h, z0.h[4]\n" - "fmla z25.h, z7.h, z0.h[5]\n" - "fmla z28.h, z7.h, z0.h[6]\n" - "fmla z31.h, z7.h, z0.h[7]\n" + "fmla z20.h, z2.h, z3.h[4]\n" + "fmla z23.h, z2.h, z3.h[5]\n" + "fmla z26.h, z2.h, z3.h[6]\n" + "fmla z29.h, z2.h, z3.h[7]\n" + "fmla z9.h, z1.h, z3.h[0]\n" + "fmla z12.h, z1.h, z3.h[1]\n" + "fmla z15.h, z1.h, z3.h[2]\n" + "fmla z18.h, z1.h, z3.h[3]\n" + "fmla z21.h, z1.h, z3.h[4]\n" + "fmla z24.h, z1.h, z3.h[5]\n" + "fmla z27.h, z1.h, z3.h[6]\n" + "fmla z30.h, z1.h, z3.h[7]\n" + "fmla z10.h, z0.h, z3.h[0]\n" + "fmla z13.h, z0.h, z3.h[1]\n" + "fmla z16.h, z0.h, z3.h[2]\n" + "fmla z19.h, z0.h, z3.h[3]\n" + "fmla z22.h, z0.h, z3.h[4]\n" + "fmla z25.h, z0.h, z3.h[5]\n" + "fmla z28.h, z0.h, z3.h[6]\n" + "fmla z31.h, z0.h, z3.h[7]\n" "6:" // multiply loop done - "dech x25, ALL, MUL #3\n" + "dech x26, ALL, MUL #3\n" "st1h { z8.h }, p0, [%x[Cpanel]]\n" - "cmp x25, XZR\n" + "cmp x26, XZR\n" "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n" "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp index aa3507ee73..ac6986913d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp index 8c8b6b0675..c65c3a3ce4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/a64fx.cpp @@ -52,33 +52,33 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( __asm__ __volatile__( "ptrue p0.b\n" "1:" // Height loop - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x24, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x26, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x25, %x[Apanel]\n" "2:" // Width loop - "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" "cntw x23, ALL, MUL #2\n" - "add x22, x26, x20, LSL #2\n" + "add x22, x24, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" "add x20, x21, x20, LSL #2\n" - "cmp x25, x23\n" + "cmp x26, x23\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov %x[Apanel], x24\n" + "mov %x[Apanel], x25\n" "bgt 3f\n" "decw x23\n" - "cmp x25, x23\n" - "mov x21, x26\n" + "cmp x26, x23\n" + "mov x21, x24\n" "bgt 3f\n" - "mov x22, x26\n" + "mov x22, x24\n" "3:" // B setup done "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" "mov z8.b, #0x0\n" "mov z9.b, #0x0\n" "mov z10.b, #0x0\n" - "ld1w { z0.s }, p0/Z, [x26]\n" + "ld1w { z0.s }, p0/Z, [x24]\n" "mov z11.b, #0x0\n" "mov z12.b, #0x0\n" "ld1w { z1.s }, p0/Z, [x22]\n" @@ -116,12 +116,12 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "fmla z11.s, p0/M, z0.s, z4.s\n" "fmla z12.s, p0/M, z1.s, z4.s\n" "fmla z13.s, p0/M, z2.s, z4.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #20]\n" "fmla z14.s, p0/M, z0.s, z5.s\n" "fmla z15.s, p0/M, z1.s, z5.s\n" "cmp x20, #0x2\n" "fmla z16.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #24]\n" "fmla z17.s, p0/M, z0.s, z6.s\n" "fmla z18.s, p0/M, z1.s, z6.s\n" "fmla z19.s, p0/M, z2.s, z6.s\n" @@ -130,57 +130,57 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "fmla z21.s, p0/M, z1.s, z3.s\n" "fmla z22.s, p0/M, z2.s, z3.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" - "fmla z23.s, p0/M, z0.s, z4.s\n" - "fmla z24.s, p0/M, z1.s, z4.s\n" - "fmla z25.s, p0/M, z2.s, z4.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" + "fmla z23.s, p0/M, z0.s, z7.s\n" + "fmla z24.s, p0/M, z1.s, z7.s\n" + "fmla z25.s, p0/M, z2.s, z7.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #36]\n" + "fmla z26.s, p0/M, z0.s, z4.s\n" + "fmla z27.s, p0/M, z1.s, z4.s\n" + "fmla z28.s, p0/M, z2.s, z4.s\n" + "ld1rw { z4.s }, p0/Z, [%x[Apanel], #40]\n" "fmla z29.s, p0/M, z0.s, z6.s\n" - "ld1w { z0.s }, p0/Z, [x26, #1, MUL VL]\n" + "ld1w { z7.s }, p0/Z, [x24, #1, MUL VL]\n" "fmla z30.s, p0/M, z1.s, z6.s\n" "fmla z31.s, p0/M, z2.s, z6.s\n" - "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n" + "ld1w { z6.s }, p0/Z, [x22, #1, MUL VL]\n" "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n" - "fmla z8.s, p0/M, z0.s, z3.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n" - 
"fmla z9.s, p0/M, z1.s, z3.s\n" + "fmla z8.s, p0/M, z7.s, z3.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n" + "fmla z9.s, p0/M, z6.s, z3.s\n" "fmla z10.s, p0/M, z2.s, z3.s\n" - "fmla z11.s, p0/M, z0.s, z4.s\n" + "fmla z11.s, p0/M, z7.s, z5.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" - "fmla z12.s, p0/M, z1.s, z4.s\n" - "fmla z13.s, p0/M, z2.s, z4.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" - "fmla z14.s, p0/M, z0.s, z5.s\n" - "fmla z15.s, p0/M, z1.s, z5.s\n" - "addvl x26, x26, #2\n" - "fmla z16.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" - "fmla z17.s, p0/M, z0.s, z6.s\n" - "fmla z18.s, p0/M, z1.s, z6.s\n" - "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "fmla z12.s, p0/M, z6.s, z5.s\n" + "fmla z13.s, p0/M, z2.s, z5.s\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #52]\n" + "fmla z14.s, p0/M, z7.s, z4.s\n" + "fmla z15.s, p0/M, z6.s, z4.s\n" + "addvl x24, x24, #2\n" + "fmla z16.s, p0/M, z2.s, z4.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n" + "fmla z17.s, p0/M, z7.s, z1.s\n" + "fmla z18.s, p0/M, z6.s, z1.s\n" + "fmla z19.s, p0/M, z2.s, z1.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n" "addvl x22, x22, #2\n" "addvl x21, x21, #2\n" "add %x[Apanel], %x[Apanel], #0x40\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" + "fmla z20.s, p0/M, z7.s, z3.s\n" + "fmla z21.s, p0/M, z6.s, z3.s\n" "fmla z22.s, p0/M, z2.s, z3.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "fmla z23.s, p0/M, z0.s, z4.s\n" - "fmla z24.s, p0/M, z1.s, z4.s\n" - "fmla z25.s, p0/M, z2.s, z4.s\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" + "fmla z23.s, p0/M, z7.s, z5.s\n" + "fmla z24.s, p0/M, z6.s, z5.s\n" + "fmla z25.s, p0/M, z2.s, z5.s\n" + "fmla z26.s, p0/M, z7.s, z0.s\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" + "fmla z27.s, p0/M, z6.s, z0.s\n" + "fmla z28.s, p0/M, z2.s, z0.s\n" "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" - "ld1w { z0.s }, p0/Z, [x26]\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z29.s, p0/M, z7.s, z1.s\n" + "ld1w { z0.s }, p0/Z, [x24]\n" + "fmla z30.s, p0/M, z6.s, z1.s\n" + "fmla z31.s, p0/M, z2.s, z1.s\n" "ld1w { z1.s }, p0/Z, [x22]\n" "ld1w { z2.s }, p0/Z, [x21]\n" "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" @@ -188,9 +188,9 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "5:" // main loop skip "fmla z8.s, p0/M, z0.s, z3.s\n" "fmla z9.s, p0/M, z1.s, z3.s\n" - "addvl x26, x26, #1\n" + "addvl x24, x24, #1\n" "fmla z10.s, p0/M, z2.s, z3.s\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "fmla z11.s, p0/M, z0.s, z4.s\n" "fmla z12.s, p0/M, z1.s, z4.s\n" "fmla z13.s, p0/M, z2.s, z4.s\n" @@ -203,11 +203,11 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "fmla z17.s, p0/M, z0.s, z6.s\n" "fmla z18.s, p0/M, z1.s, z6.s\n" "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z0.s, z7.s\n" + "fmla z21.s, p0/M, z1.s, z7.s\n" "addvl x21, x21, #1\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" + "fmla z22.s, p0/M, z2.s, z7.s\n" "fmla z23.s, p0/M, z0.s, z4.s\n" "add %x[Apanel], %x[Apanel], #0x20\n" "fmla z24.s, p0/M, z1.s, z4.s\n" @@ -215,50 +215,50 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "fmla z26.s, p0/M, z0.s, z5.s\n" "fmla z27.s, p0/M, z1.s, z5.s\n" "fmla z28.s, p0/M, z2.s, z5.s\n" - 
"fmla z29.s, p0/M, z0.s, z6.s\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z29.s, p0/M, z0.s, z3.s\n" + "fmla z30.s, p0/M, z1.s, z3.s\n" + "fmla z31.s, p0/M, z2.s, z3.s\n" "cbz x20, 6f\n" - "ld1w { z0.s }, p0/Z, [x26]\n" - "ld1w { z1.s }, p0/Z, [x22]\n" - "ld1w { z2.s }, p0/Z, [x21]\n" + "ld1w { z6.s }, p0/Z, [x24]\n" + "ld1w { z5.s }, p0/Z, [x22]\n" + "ld1w { z4.s }, p0/Z, [x21]\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "fmla z8.s, p0/M, z0.s, z3.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" - "fmla z9.s, p0/M, z1.s, z3.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" - "fmla z10.s, p0/M, z2.s, z3.s\n" - "fmla z11.s, p0/M, z0.s, z4.s\n" - "fmla z12.s, p0/M, z1.s, z4.s\n" - "fmla z13.s, p0/M, z2.s, z4.s\n" + "fmla z8.s, p0/M, z6.s, z3.s\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n" + "fmla z9.s, p0/M, z5.s, z3.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n" + "fmla z10.s, p0/M, z4.s, z3.s\n" + "fmla z11.s, p0/M, z6.s, z2.s\n" + "fmla z12.s, p0/M, z5.s, z2.s\n" + "fmla z13.s, p0/M, z4.s, z2.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" - "fmla z14.s, p0/M, z0.s, z5.s\n" - "fmla z15.s, p0/M, z1.s, z5.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" - "fmla z16.s, p0/M, z2.s, z5.s\n" - "fmla z17.s, p0/M, z0.s, z6.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" - "fmla z18.s, p0/M, z1.s, z6.s\n" - "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" + "fmla z14.s, p0/M, z6.s, z1.s\n" + "fmla z15.s, p0/M, z5.s, z1.s\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n" + "fmla z16.s, p0/M, z4.s, z1.s\n" + "fmla z17.s, p0/M, z6.s, z0.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n" + "fmla z18.s, p0/M, z5.s, z0.s\n" + "fmla z19.s, p0/M, z4.s, z0.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z6.s, z3.s\n" + "fmla z21.s, p0/M, z5.s, z3.s\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" - "fmla z23.s, p0/M, z0.s, z4.s\n" - "fmla z24.s, p0/M, z1.s, z4.s\n" - "fmla z25.s, p0/M, z2.s, z4.s\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z22.s, p0/M, z4.s, z3.s\n" + "fmla z23.s, p0/M, z6.s, z2.s\n" + "fmla z24.s, p0/M, z5.s, z2.s\n" + "fmla z25.s, p0/M, z4.s, z2.s\n" + "fmla z26.s, p0/M, z6.s, z1.s\n" + "fmla z27.s, p0/M, z5.s, z1.s\n" + "fmla z28.s, p0/M, z4.s, z1.s\n" + "fmla z29.s, p0/M, z6.s, z0.s\n" + "fmla z30.s, p0/M, z5.s, z0.s\n" + "fmla z31.s, p0/M, z4.s, z0.s\n" "6:" // multiply loop done - "decw x25, ALL, MUL #3\n" + "decw x26, ALL, MUL #3\n" "st1w { z8.s }, p0, [%x[Cpanel]]\n" - "cmp x25, XZR\n" + "cmp x26, XZR\n" "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n" "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n" @@ -289,7 +289,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)) - : "cc", "memory", "p0", "x20", "x21", "x22", 
"x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp index 4a0b31daff..4b20be6f01 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_ffinterleaved_fp32_mla_8x3VL/generic.cpp @@ -52,26 +52,26 @@ void sve_ffinterleaved_fp32_mla_8x3VL( __asm__ __volatile__( "ptrue p0.b\n" "1:" // Height loop - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "str x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov x24, %x[Apanel]\n" + "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" + "ldr x26, [%x[args_ptr], %[offsetof_N]]\n" + "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "mov x25, %x[Apanel]\n" "2:" // Width loop - "ldr x26, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" + "ldr x24, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n" "cntw x23, ALL, MUL #2\n" - "add x22, x26, x20, LSL #2\n" + "add x22, x24, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" "add x20, x21, x20, LSL #2\n" - "cmp x25, x23\n" + "cmp x26, x23\n" "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n" - "mov %x[Apanel], x24\n" + "mov %x[Apanel], x25\n" "bgt 3f\n" "decw x23\n" - "cmp x25, x23\n" - "mov x21, x26\n" + "cmp x26, x23\n" + "mov x21, x24\n" "bgt 3f\n" - "mov x22, x26\n" + "mov x22, x24\n" "3:" // B setup done "ldr x20, [%x[args_ptr], %[offsetof_K]]\n" "cmp x20, #0x2\n" @@ -84,7 +84,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL( "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n" "mov z13.b, #0x0\n" "mov z14.b, #0x0\n" - "ld1w { z4.s }, p0/Z, [x26]\n" + "ld1w { z4.s }, p0/Z, [x24]\n" "mov z15.b, #0x0\n" "mov z16.b, #0x0\n" "ld1w { z5.s }, p0/Z, [x22]\n" @@ -108,19 +108,19 @@ void sve_ffinterleaved_fp32_mla_8x3VL( "4:" // main loop head "fmla z8.s, z4.s, z0.s[0]\n" "fmla z11.s, z4.s, z0.s[1]\n" - "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n" "fmla z14.s, z4.s, z0.s[2]\n" "fmla z17.s, z4.s, z0.s[3]\n" - "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n" "fmla z20.s, z4.s, z1.s[0]\n" "fmla z23.s, z4.s, z1.s[1]\n" "sub x20, x20, #0x2\n" "fmla z26.s, z4.s, z1.s[2]\n" "fmla z29.s, z4.s, z1.s[3]\n" - "ld1w { z4.s }, p0/Z, [x26, #1, MUL VL]\n" + "ld1w { z4.s }, p0/Z, [x24, #1, MUL VL]\n" "fmla z9.s, z5.s, z0.s[0]\n" "fmla z12.s, z5.s, z0.s[1]\n" - "addvl x26, x26, #2\n" + "addvl x24, x24, #2\n" "fmla z15.s, z5.s, z0.s[2]\n" "fmla z18.s, z5.s, z0.s[3]\n" "cmp x20, #0x2\n" @@ -140,35 +140,35 @@ void sve_ffinterleaved_fp32_mla_8x3VL( "fmla z25.s, z6.s, z1.s[1]\n" "fmla z28.s, z6.s, z1.s[2]\n" "fmla z31.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p0/Z, [x21, #1, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x21, #1, MUL VL]\n" "addvl x21, x21, #2\n" - "fmla z8.s, z4.s, z2.s[0]\n" - "fmla z11.s, z4.s, z2.s[1]\n" - "fmla z14.s, z4.s, z2.s[2]\n" - "fmla z17.s, 
z4.s, z2.s[3]\n" + "fmla z8.s, z4.s, z3.s[0]\n" + "fmla z11.s, z4.s, z3.s[1]\n" + "fmla z14.s, z4.s, z3.s[2]\n" + "fmla z17.s, z4.s, z3.s[3]\n" "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n" - "fmla z20.s, z4.s, z3.s[0]\n" - "fmla z23.s, z4.s, z3.s[1]\n" - "fmla z26.s, z4.s, z3.s[2]\n" - "fmla z29.s, z4.s, z3.s[3]\n" - "ld1w { z4.s }, p0/Z, [x26]\n" - "fmla z9.s, z5.s, z2.s[0]\n" - "fmla z12.s, z5.s, z2.s[1]\n" - "fmla z15.s, z5.s, z2.s[2]\n" - "fmla z18.s, z5.s, z2.s[3]\n" - "fmla z21.s, z5.s, z3.s[0]\n" - "fmla z24.s, z5.s, z3.s[1]\n" - "fmla z27.s, z5.s, z3.s[2]\n" - "fmla z30.s, z5.s, z3.s[3]\n" + "fmla z20.s, z4.s, z7.s[0]\n" + "fmla z23.s, z4.s, z7.s[1]\n" + "fmla z26.s, z4.s, z7.s[2]\n" + "fmla z29.s, z4.s, z7.s[3]\n" + "ld1w { z4.s }, p0/Z, [x24]\n" + "fmla z9.s, z5.s, z3.s[0]\n" + "fmla z12.s, z5.s, z3.s[1]\n" + "fmla z15.s, z5.s, z3.s[2]\n" + "fmla z18.s, z5.s, z3.s[3]\n" + "fmla z21.s, z5.s, z7.s[0]\n" + "fmla z24.s, z5.s, z7.s[1]\n" + "fmla z27.s, z5.s, z7.s[2]\n" + "fmla z30.s, z5.s, z7.s[3]\n" "ld1w { z5.s }, p0/Z, [x22]\n" - "fmla z10.s, z6.s, z2.s[0]\n" - "fmla z13.s, z6.s, z2.s[1]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z19.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z25.s, z6.s, z3.s[1]\n" - "fmla z28.s, z6.s, z3.s[2]\n" - "fmla z31.s, z6.s, z3.s[3]\n" + "fmla z10.s, z2.s, z3.s[0]\n" + "fmla z13.s, z2.s, z3.s[1]\n" + "fmla z16.s, z2.s, z3.s[2]\n" + "fmla z19.s, z2.s, z3.s[3]\n" + "fmla z22.s, z2.s, z7.s[0]\n" + "fmla z25.s, z2.s, z7.s[1]\n" + "fmla z28.s, z2.s, z7.s[2]\n" + "fmla z31.s, z2.s, z7.s[3]\n" "ld1w { z6.s }, p0/Z, [x21]\n" "bge 4b\n" "5:" // main loop skip @@ -177,7 +177,7 @@ void sve_ffinterleaved_fp32_mla_8x3VL( "add %x[Apanel], %x[Apanel], #0x20\n" "fmla z14.s, z4.s, z0.s[2]\n" "fmla z17.s, z4.s, z0.s[3]\n" - "addvl x26, x26, #1\n" + "addvl x24, x24, #1\n" "fmla z20.s, z4.s, z1.s[0]\n" "fmla z23.s, z4.s, z1.s[1]\n" "addvl x22, x22, #1\n" @@ -201,40 +201,40 @@ void sve_ffinterleaved_fp32_mla_8x3VL( "fmla z28.s, z6.s, z1.s[2]\n" "fmla z31.s, z6.s, z1.s[3]\n" "cbz x20, 6f\n" - "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n" - "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n" + "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1w { z7.s }, p0/Z, [x26]\n" - "ld1w { z4.s }, p0/Z, [x22]\n" - "fmla z8.s, z7.s, z0.s[0]\n" - "ld1w { z5.s }, p0/Z, [x21]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z14.s, z7.s, z0.s[2]\n" - "fmla z17.s, z7.s, z0.s[3]\n" - "fmla z20.s, z7.s, z1.s[0]\n" - "fmla z23.s, z7.s, z1.s[1]\n" - "fmla z26.s, z7.s, z1.s[2]\n" - "fmla z29.s, z7.s, z1.s[3]\n" - "fmla z9.s, z4.s, z0.s[0]\n" - "fmla z12.s, z4.s, z0.s[1]\n" - "fmla z15.s, z4.s, z0.s[2]\n" - "fmla z18.s, z4.s, z0.s[3]\n" - "fmla z21.s, z4.s, z1.s[0]\n" - "fmla z24.s, z4.s, z1.s[1]\n" - "fmla z27.s, z4.s, z1.s[2]\n" - "fmla z30.s, z4.s, z1.s[3]\n" - "fmla z10.s, z5.s, z0.s[0]\n" - "fmla z13.s, z5.s, z0.s[1]\n" - "fmla z16.s, z5.s, z0.s[2]\n" - "fmla z19.s, z5.s, z0.s[3]\n" - "fmla z22.s, z5.s, z1.s[0]\n" - "fmla z25.s, z5.s, z1.s[1]\n" - "fmla z28.s, z5.s, z1.s[2]\n" - "fmla z31.s, z5.s, z1.s[3]\n" + "ld1w { z2.s }, p0/Z, [x24]\n" + "ld1w { z1.s }, p0/Z, [x22]\n" + "fmla z8.s, z2.s, z4.s[0]\n" + "ld1w { z0.s }, p0/Z, [x21]\n" + "fmla z11.s, z2.s, z4.s[1]\n" + "fmla z14.s, z2.s, z4.s[2]\n" + "fmla z17.s, z2.s, z4.s[3]\n" + "fmla z20.s, z2.s, z3.s[0]\n" + "fmla z23.s, z2.s, z3.s[1]\n" + "fmla z26.s, z2.s, z3.s[2]\n" + "fmla z29.s, z2.s, z3.s[3]\n" + "fmla z9.s, z1.s, z4.s[0]\n" + "fmla z12.s, z1.s, z4.s[1]\n" 
+ "fmla z15.s, z1.s, z4.s[2]\n" + "fmla z18.s, z1.s, z4.s[3]\n" + "fmla z21.s, z1.s, z3.s[0]\n" + "fmla z24.s, z1.s, z3.s[1]\n" + "fmla z27.s, z1.s, z3.s[2]\n" + "fmla z30.s, z1.s, z3.s[3]\n" + "fmla z10.s, z0.s, z4.s[0]\n" + "fmla z13.s, z0.s, z4.s[1]\n" + "fmla z16.s, z0.s, z4.s[2]\n" + "fmla z19.s, z0.s, z4.s[3]\n" + "fmla z22.s, z0.s, z3.s[0]\n" + "fmla z25.s, z0.s, z3.s[1]\n" + "fmla z28.s, z0.s, z3.s[2]\n" + "fmla z31.s, z0.s, z3.s[3]\n" "6:" // multiply loop done - "decw x25, ALL, MUL #3\n" + "decw x26, ALL, MUL #3\n" "st1w { z8.s }, p0, [%x[Cpanel]]\n" - "cmp x25, XZR\n" + "cmp x26, XZR\n" "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n" "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp index 6677c23216..49ccce342e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -75,7 +75,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -100,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp index f0b00e6251..176f6e0d3a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -140,11 +140,11 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -157,87 +157,87 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10]\n" + ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460420a // bfdot z10.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x64684208 // bfdot z8.s, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6468420a // bfdot z10.s, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - "ld1h { 
z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n" + ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n" + ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n" + ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n" + ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n" "add x26, x26, #0x10\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10]\n" + ".inst 0x64604208 // bfdot z8.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x2\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n" + ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n" + ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n" + ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n" + ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n" + "ld1h { z17.h 
}, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n" + ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n" + ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n" + ".inst 0x6478420b // bfdot z11.s, z16.h, z0.h[3]\n" "addvl x10, x10, #4\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -246,17 +246,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "bne 6b\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -296,15 +296,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" "b 18f\n" "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -320,12 +320,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr 
x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -333,130 +333,130 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "21:" // Height 2: input setup done "cmp x27, #0x8\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z1.h }, p0/Z, [x26]\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64614228 // bfdot z8.s, z17.h, z1.h[0]\n" + ".inst 0x6460422c // bfdot z12.s, z17.h, z0.h[0]\n" + ".inst 0x64614209 // bfdot z9.s, z16.h, z1.h[0]\n" + ".inst 0x6460420d // bfdot z13.s, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461422a // bfdot z10.s, z17.h, z1.h[0]\n" + ".inst 0x6460422e // bfdot z14.s, z17.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" "cmp x27, #0x8\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461420b // bfdot z11.s, z16.h, z1.h[0]\n" + ".inst 0x6460420f // bfdot z15.s, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x64694228 // bfdot z8.s, z17.h, z1.h[1]\n" + ".inst 0x6468422c // bfdot z12.s, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x64694209 // bfdot z9.s, z16.h, z1.h[1]\n" + ".inst 0x6468420d // bfdot z13.s, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 
0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x6469422a // bfdot z10.s, z17.h, z1.h[1]\n" + ".inst 0x6468422e // bfdot z14.s, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6469420b // bfdot z11.s, z16.h, z1.h[1]\n" + ".inst 0x6468420f // bfdot z15.s, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x64714228 // bfdot z8.s, z17.h, z1.h[2]\n" + ".inst 0x6470422c // bfdot z12.s, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x64714209 // bfdot z9.s, z16.h, z1.h[2]\n" + ".inst 0x6470420d // bfdot z13.s, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6471422a // bfdot z10.s, z17.h, z1.h[2]\n" + ".inst 0x6470422e // bfdot z14.s, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6471420b // bfdot z11.s, z16.h, z1.h[2]\n" + ".inst 0x6470420f // bfdot z15.s, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x64794228 // bfdot z8.s, z17.h, z1.h[3]\n" + ".inst 0x6478422c // bfdot z12.s, z17.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x64794209 // bfdot z9.s, z16.h, z1.h[3]\n" + ".inst 0x6478420d // bfdot z13.s, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6479422a // bfdot z10.s, z17.h, z1.h[3]\n" + ".inst 0x6478422e // bfdot z14.s, z17.h, z0.h[3]\n" + ".inst 0x6479420b // bfdot z11.s, z16.h, z1.h[3]\n" + ".inst 0x6478420f // bfdot z15.s, z16.h, z0.h[3]\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x2\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64604228 // bfdot z8.s, z17.h, z0.h[0]\n" + ".inst 0x6461422c // bfdot z12.s, z17.h, z1.h[0]\n" + ".inst 0x64604209 // bfdot z9.s, z16.h, z0.h[0]\n" + ".inst 0x6461420d // bfdot z13.s, z16.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6460422a // bfdot z10.s, z17.h, z0.h[0]\n" + ".inst 0x6461422e // bfdot z14.s, z17.h, z1.h[0]\n" "addvl x10, x10, #4\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x6460420b // bfdot z11.s, z16.h, z0.h[0]\n" + ".inst 0x6461420f // bfdot z15.s, z16.h, z1.h[0]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 
0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64684228 // bfdot z8.s, z17.h, z0.h[1]\n" + ".inst 0x6469422c // bfdot z12.s, z17.h, z1.h[1]\n" + ".inst 0x64684209 // bfdot z9.s, z16.h, z0.h[1]\n" + ".inst 0x6469420d // bfdot z13.s, z16.h, z1.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x6468422a // bfdot z10.s, z17.h, z0.h[1]\n" + ".inst 0x6469422e // bfdot z14.s, z17.h, z1.h[1]\n" "addvl x10, x10, #4\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x6468420b // bfdot z11.s, z16.h, z0.h[1]\n" + ".inst 0x6469420f // bfdot z15.s, z16.h, z1.h[1]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64704228 // bfdot z8.s, z17.h, z0.h[2]\n" + ".inst 0x6471422c // bfdot z12.s, z17.h, z1.h[2]\n" + ".inst 0x64704209 // bfdot z9.s, z16.h, z0.h[2]\n" + ".inst 0x6471420d // bfdot z13.s, z16.h, z1.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x6470422a // bfdot z10.s, z17.h, z0.h[2]\n" + ".inst 0x6471422e // bfdot z14.s, z17.h, z1.h[2]\n" "addvl x10, x10, #4\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x6470420b // bfdot z11.s, z16.h, z0.h[2]\n" + ".inst 0x6471420f // bfdot z15.s, z16.h, z1.h[2]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64784228 // bfdot z8.s, z17.h, z0.h[3]\n" + ".inst 0x6479422c // bfdot z12.s, z17.h, z1.h[3]\n" + ".inst 0x64784209 // bfdot z9.s, z16.h, z0.h[3]\n" + ".inst 0x6479420d // bfdot z13.s, z16.h, z1.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6478422a // bfdot z10.s, z17.h, z0.h[3]\n" + ".inst 0x6479422e // bfdot z14.s, z17.h, z1.h[3]\n" "addvl x10, x10, #4\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x6478420b // bfdot 
z11.s, z16.h, z0.h[3]\n" + ".inst 0x6479420f // bfdot z15.s, z16.h, z1.h[3]\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -466,25 +466,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "add x25, x9, x20, LSL #2\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, p5/M, z14.s, z17.s\n" + "fmin z15.s, p5/M, z15.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z15.s, p5/M, z15.s, z16.s\n" "25:" // Height 2: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -532,20 +532,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20]\n" + "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n" "b 31f\n" "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -565,13 +565,13 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + 
"ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -580,86 +580,86 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "34:" // Height 3: input setup done "cmp x27, #0x8\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1rqh { z0.h }, p0/Z, [x24]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + ".inst 0x646242a8 // bfdot z8.s, z21.h, z2.h[0]\n" + ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646042b0 // bfdot z16.s, z21.h, z0.h[0]\n" + ".inst 0x64624289 // bfdot z9.s, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n" + ".inst 0x64604291 // bfdot z17.s, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "cmp x27, #0x8\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646242aa // bfdot z10.s, z21.h, z2.h[0]\n" + ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x646042b2 // bfdot z18.s, z21.h, z0.h[0]\n" + ".inst 0x6462428b // bfdot z11.s, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n" + ".inst 0x64604293 // bfdot z19.s, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x646a42a8 // bfdot z8.s, z21.h, z2.h[1]\n" + ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n" + ".inst 0x646842b0 // bfdot z16.s, z21.h, z0.h[1]\n" + ".inst 0x646a4289 // bfdot z9.s, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n" + ".inst 0x64684291 // bfdot z17.s, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840ca // bfdot 
z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x646a42aa // bfdot z10.s, z21.h, z2.h[1]\n" + ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n" + ".inst 0x646842b2 // bfdot z18.s, z21.h, z0.h[1]\n" + ".inst 0x646a428b // bfdot z11.s, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n" + ".inst 0x64684293 // bfdot z19.s, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x647242a8 // bfdot z8.s, z21.h, z2.h[2]\n" + ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n" + ".inst 0x647042b0 // bfdot z16.s, z21.h, z0.h[2]\n" + ".inst 0x64724289 // bfdot z9.s, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n" + ".inst 0x64704291 // bfdot z17.s, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x647242aa // bfdot z10.s, z21.h, z2.h[2]\n" + ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n" + ".inst 0x647042b2 // bfdot z18.s, z21.h, z0.h[2]\n" + ".inst 0x6472428b // bfdot z11.s, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n" + ".inst 0x64704293 // bfdot z19.s, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x647a42a8 // bfdot z8.s, z21.h, z2.h[3]\n" + ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n" + ".inst 0x647842b0 // bfdot z16.s, z21.h, z0.h[3]\n" + ".inst 0x647a4289 // bfdot z9.s, z20.h, z2.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n" + ".inst 0x64784291 // bfdot z17.s, z20.h, z0.h[3]\n" + "ld1h 
{ z20.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x647a42aa // bfdot z10.s, z21.h, z2.h[3]\n" + ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n" + ".inst 0x647842b2 // bfdot z18.s, z21.h, z0.h[3]\n" + ".inst 0x647a428b // bfdot z11.s, z20.h, z2.h[3]\n" + ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n" + ".inst 0x64784293 // bfdot z19.s, z20.h, z0.h[3]\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -667,79 +667,79 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x2\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + ".inst 0x646042a8 // bfdot z8.s, z21.h, z0.h[0]\n" + ".inst 0x646142ac // bfdot z12.s, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646242b0 // bfdot z16.s, z21.h, z2.h[0]\n" + ".inst 0x64604289 // bfdot z9.s, z20.h, z0.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6461428d // bfdot z13.s, z20.h, z1.h[0]\n" + ".inst 0x64624291 // bfdot z17.s, z20.h, z2.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646042aa // bfdot z10.s, z21.h, z0.h[0]\n" + ".inst 0x646142ae // bfdot z14.s, z21.h, z1.h[0]\n" + ".inst 0x646242b2 // bfdot z18.s, z21.h, z2.h[0]\n" + ".inst 0x6460428b // bfdot z11.s, z20.h, z0.h[0]\n" + ".inst 0x6461428f // bfdot z15.s, z20.h, z1.h[0]\n" + ".inst 0x64624293 // bfdot z19.s, z20.h, z2.h[0]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646842a8 // bfdot z8.s, z21.h, z0.h[1]\n" + ".inst 0x646942ac // bfdot z12.s, z21.h, z1.h[1]\n" + ".inst 0x646a42b0 // bfdot z16.s, z21.h, z2.h[1]\n" + ".inst 0x64684289 // bfdot z9.s, z20.h, z0.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6469428d // bfdot z13.s, z20.h, z1.h[1]\n" + ".inst 0x646a4291 // bfdot z17.s, z20.h, z2.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - 
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646842aa // bfdot z10.s, z21.h, z0.h[1]\n" + ".inst 0x646942ae // bfdot z14.s, z21.h, z1.h[1]\n" + ".inst 0x646a42b2 // bfdot z18.s, z21.h, z2.h[1]\n" + ".inst 0x6468428b // bfdot z11.s, z20.h, z0.h[1]\n" + ".inst 0x6469428f // bfdot z15.s, z20.h, z1.h[1]\n" + ".inst 0x646a4293 // bfdot z19.s, z20.h, z2.h[1]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647042a8 // bfdot z8.s, z21.h, z0.h[2]\n" + ".inst 0x647142ac // bfdot z12.s, z21.h, z1.h[2]\n" + ".inst 0x647242b0 // bfdot z16.s, z21.h, z2.h[2]\n" + ".inst 0x64704289 // bfdot z9.s, z20.h, z0.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471428d // bfdot z13.s, z20.h, z1.h[2]\n" + ".inst 0x64724291 // bfdot z17.s, z20.h, z2.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647042aa // bfdot z10.s, z21.h, z0.h[2]\n" + ".inst 0x647142ae // bfdot z14.s, z21.h, z1.h[2]\n" + ".inst 0x647242b2 // bfdot z18.s, z21.h, z2.h[2]\n" + ".inst 0x6470428b // bfdot z11.s, z20.h, z0.h[2]\n" + ".inst 0x6471428f // bfdot z15.s, z20.h, z1.h[2]\n" + ".inst 0x64724293 // bfdot z19.s, z20.h, z2.h[2]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647842a8 // bfdot z8.s, z21.h, z0.h[3]\n" + ".inst 0x647942ac // bfdot z12.s, z21.h, z1.h[3]\n" + ".inst 0x647a42b0 // bfdot z16.s, z21.h, z2.h[3]\n" + ".inst 0x64784289 // bfdot z9.s, z20.h, z0.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6479428d // bfdot z13.s, z20.h, z1.h[3]\n" + ".inst 0x647a4291 // bfdot z17.s, z20.h, z2.h[3]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647842aa // bfdot z10.s, z21.h, z0.h[3]\n" + ".inst 0x647942ae // bfdot z14.s, z21.h, z1.h[3]\n" + ".inst 0x647a42b2 // bfdot z18.s, z21.h, z2.h[3]\n" + 
".inst 0x6478428b // bfdot z11.s, z20.h, z0.h[3]\n" + ".inst 0x6479428f // bfdot z15.s, z20.h, z1.h[3]\n" + ".inst 0x647a4293 // bfdot z19.s, z20.h, z2.h[3]\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -750,33 +750,33 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z21.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z20.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z21.s\n" + "fmin z9.s, p5/M, z9.s, z21.s\n" + "fmin z10.s, p5/M, z10.s, z21.s\n" + "fmin z11.s, p5/M, z11.s, z21.s\n" + "fmin z12.s, p5/M, z12.s, z21.s\n" + "fmin z13.s, p5/M, z13.s, z21.s\n" + "fmin z14.s, p5/M, z14.s, z21.s\n" + "fmin z15.s, p5/M, z15.s, z21.s\n" + "fmin z16.s, p5/M, z16.s, z21.s\n" + "fmin z17.s, p5/M, z17.s, z21.s\n" + "fmin z18.s, p5/M, z18.s, z21.s\n" + "fmin z19.s, p5/M, z19.s, z21.s\n" + "fmax z8.s, p5/M, z8.s, z20.s\n" + "fmax z9.s, p5/M, z9.s, z20.s\n" + "fmax z10.s, p5/M, z10.s, z20.s\n" + "fmax z11.s, p5/M, z11.s, z20.s\n" + "fmax z12.s, p5/M, z12.s, z20.s\n" + "fmax z13.s, p5/M, z13.s, z20.s\n" + "fmax z14.s, p5/M, z14.s, z20.s\n" + "fmax z15.s, p5/M, z15.s, z20.s\n" + "fmax z16.s, p5/M, z16.s, z20.s\n" + "fmax z17.s, p5/M, z17.s, z20.s\n" + "fmax z18.s, p5/M, z18.s, z20.s\n" + "fmax z19.s, p5/M, z19.s, z20.s\n" "38:" // Height 3: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -832,25 +832,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "ld1w { z13.s }, p3/Z, [x22, 
#1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21]\n" + "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "b 44f\n" "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -874,14 +874,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -891,105 +891,105 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "47:" // Height 4: input setup done "cmp x27, #0x8\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z3.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "ld1rqh { z0.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64634328 // bfdot z8.s, z25.h, z3.h[0]\n" + ".inst 0x6462432c // bfdot z12.s, z25.h, z2.h[0]\n" + ".inst 0x64614330 // bfdot z16.s, z25.h, z1.h[0]\n" + ".inst 0x64604334 // bfdot z20.s, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + ".inst 0x64634309 // bfdot z9.s, z24.h, z3.h[0]\n" + ".inst 0x6462430d // bfdot z13.s, z24.h, z2.h[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - 
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x64614311 // bfdot z17.s, z24.h, z1.h[0]\n" + ".inst 0x64604315 // bfdot z21.s, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6463432a // bfdot z10.s, z25.h, z3.h[0]\n" + ".inst 0x6462432e // bfdot z14.s, z25.h, z2.h[0]\n" + ".inst 0x64614332 // bfdot z18.s, z25.h, z1.h[0]\n" + ".inst 0x64604336 // bfdot z22.s, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6463430b // bfdot z11.s, z24.h, z3.h[0]\n" + ".inst 0x6462430f // bfdot z15.s, z24.h, z2.h[0]\n" + ".inst 0x64614313 // bfdot z19.s, z24.h, z1.h[0]\n" + ".inst 0x64604317 // bfdot z23.s, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x646b4328 // bfdot z8.s, z25.h, z3.h[1]\n" + ".inst 0x646a432c // bfdot z12.s, z25.h, z2.h[1]\n" + ".inst 0x64694330 // bfdot z16.s, z25.h, z1.h[1]\n" + ".inst 0x64684334 // bfdot z20.s, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x646b4309 // bfdot z9.s, z24.h, z3.h[1]\n" + ".inst 0x646a430d // bfdot z13.s, z24.h, z2.h[1]\n" + ".inst 0x64694311 // bfdot z17.s, z24.h, z1.h[1]\n" + ".inst 0x64684315 // bfdot z21.s, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 
0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x646b432a // bfdot z10.s, z25.h, z3.h[1]\n" + ".inst 0x646a432e // bfdot z14.s, z25.h, z2.h[1]\n" + ".inst 0x64694332 // bfdot z18.s, z25.h, z1.h[1]\n" + ".inst 0x64684336 // bfdot z22.s, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x646b430b // bfdot z11.s, z24.h, z3.h[1]\n" + ".inst 0x646a430f // bfdot z15.s, z24.h, z2.h[1]\n" + ".inst 0x64694313 // bfdot z19.s, z24.h, z1.h[1]\n" + ".inst 0x64684317 // bfdot z23.s, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x64734328 // bfdot z8.s, z25.h, z3.h[2]\n" + ".inst 0x6472432c // bfdot z12.s, z25.h, z2.h[2]\n" + ".inst 0x64714330 // bfdot z16.s, z25.h, z1.h[2]\n" + ".inst 0x64704334 // bfdot z20.s, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x64734309 // bfdot z9.s, z24.h, z3.h[2]\n" + ".inst 0x6472430d // bfdot z13.s, z24.h, z2.h[2]\n" + ".inst 0x64714311 // bfdot z17.s, z24.h, z1.h[2]\n" + ".inst 0x64704315 // bfdot z21.s, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6473432a // bfdot z10.s, z25.h, z3.h[2]\n" + ".inst 0x6472432e // bfdot z14.s, z25.h, z2.h[2]\n" + ".inst 0x64714332 // bfdot z18.s, z25.h, z1.h[2]\n" + ".inst 0x64704336 // bfdot z22.s, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6473430b // bfdot z11.s, z24.h, z3.h[2]\n" + ".inst 0x6472430f // bfdot z15.s, z24.h, z2.h[2]\n" + ".inst 0x64714313 // bfdot z19.s, z24.h, z1.h[2]\n" + ".inst 0x64704317 // bfdot z23.s, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x647b4328 // bfdot z8.s, z25.h, z3.h[3]\n" + ".inst 0x647a432c // bfdot z12.s, z25.h, z2.h[3]\n" + ".inst 0x64794330 // bfdot z16.s, z25.h, z1.h[3]\n" + ".inst 0x64784334 // bfdot z20.s, z25.h, z0.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x647b4309 // bfdot z9.s, z24.h, z3.h[3]\n" + ".inst 0x647a430d // bfdot z13.s, z24.h, z2.h[3]\n" + ".inst 0x64794311 // bfdot z17.s, z24.h, z1.h[3]\n" + ".inst 0x64784315 // bfdot z21.s, z24.h, z0.h[3]\n" + "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x647b432a // bfdot z10.s, z25.h, z3.h[3]\n" + ".inst 0x647a432e // bfdot z14.s, z25.h, z2.h[3]\n" + ".inst 0x64794332 // bfdot z18.s, z25.h, z1.h[3]\n" + ".inst 0x64784336 // bfdot z22.s, z25.h, z0.h[3]\n" + ".inst 0x647b430b // bfdot z11.s, z24.h, z3.h[3]\n" + ".inst 0x647a430f // bfdot z15.s, z24.h, z2.h[3]\n" + ".inst 0x64794313 // bfdot z19.s, z24.h, z1.h[3]\n" + ".inst 0x64784317 // bfdot z23.s, z24.h, z0.h[3]\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -998,95 +998,95 @@ void 
sve_hybrid_bf16fp32_dot_6x4VL ( "subs x27, x27, #0x2\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64604328 // bfdot z8.s, z25.h, z0.h[0]\n" + ".inst 0x6461432c // bfdot z12.s, z25.h, z1.h[0]\n" + ".inst 0x64624330 // bfdot z16.s, z25.h, z2.h[0]\n" + ".inst 0x64634334 // bfdot z20.s, z25.h, z3.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x64604309 // bfdot z9.s, z24.h, z0.h[0]\n" + ".inst 0x6461430d // bfdot z13.s, z24.h, z1.h[0]\n" + ".inst 0x64624311 // bfdot z17.s, z24.h, z2.h[0]\n" + ".inst 0x64634315 // bfdot z21.s, z24.h, z3.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x6460432a // bfdot z10.s, z25.h, z0.h[0]\n" + ".inst 0x6461432e // bfdot z14.s, z25.h, z1.h[0]\n" + ".inst 0x64624332 // bfdot z18.s, z25.h, z2.h[0]\n" + ".inst 0x64634336 // bfdot z22.s, z25.h, z3.h[0]\n" + ".inst 0x6460430b // bfdot z11.s, z24.h, z0.h[0]\n" + ".inst 0x6461430f // bfdot z15.s, z24.h, z1.h[0]\n" + ".inst 0x64624313 // bfdot z19.s, z24.h, z2.h[0]\n" + ".inst 0x64634317 // bfdot z23.s, z24.h, z3.h[0]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64684328 // bfdot z8.s, z25.h, z0.h[1]\n" + ".inst 0x6469432c // bfdot z12.s, z25.h, z1.h[1]\n" + ".inst 0x646a4330 // bfdot z16.s, z25.h, z2.h[1]\n" + ".inst 0x646b4334 // bfdot z20.s, z25.h, z3.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x64684309 // bfdot z9.s, z24.h, z0.h[1]\n" + ".inst 0x6469430d // bfdot z13.s, z24.h, z1.h[1]\n" + ".inst 0x646a4311 // bfdot z17.s, z24.h, z2.h[1]\n" + ".inst 0x646b4315 // bfdot z21.s, z24.h, z3.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - 
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x6468432a // bfdot z10.s, z25.h, z0.h[1]\n" + ".inst 0x6469432e // bfdot z14.s, z25.h, z1.h[1]\n" + ".inst 0x646a4332 // bfdot z18.s, z25.h, z2.h[1]\n" + ".inst 0x646b4336 // bfdot z22.s, z25.h, z3.h[1]\n" + ".inst 0x6468430b // bfdot z11.s, z24.h, z0.h[1]\n" + ".inst 0x6469430f // bfdot z15.s, z24.h, z1.h[1]\n" + ".inst 0x646a4313 // bfdot z19.s, z24.h, z2.h[1]\n" + ".inst 0x646b4317 // bfdot z23.s, z24.h, z3.h[1]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64704328 // bfdot z8.s, z25.h, z0.h[2]\n" + ".inst 0x6471432c // bfdot z12.s, z25.h, z1.h[2]\n" + ".inst 0x64724330 // bfdot z16.s, z25.h, z2.h[2]\n" + ".inst 0x64734334 // bfdot z20.s, z25.h, z3.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x2\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x64704309 // bfdot z9.s, z24.h, z0.h[2]\n" + ".inst 0x6471430d // bfdot z13.s, z24.h, z1.h[2]\n" + ".inst 0x64724311 // bfdot z17.s, z24.h, z2.h[2]\n" + ".inst 0x64734315 // bfdot z21.s, z24.h, z3.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x6470432a // bfdot z10.s, z25.h, z0.h[2]\n" + ".inst 0x6471432e // bfdot z14.s, z25.h, z1.h[2]\n" + ".inst 0x64724332 // bfdot z18.s, z25.h, z2.h[2]\n" + ".inst 0x64734336 // bfdot z22.s, z25.h, z3.h[2]\n" + ".inst 0x6470430b // bfdot z11.s, z24.h, z0.h[2]\n" + ".inst 0x6471430f // bfdot z15.s, z24.h, z1.h[2]\n" + ".inst 0x64724313 // bfdot z19.s, z24.h, z2.h[2]\n" + ".inst 0x64734317 // bfdot z23.s, z24.h, z3.h[2]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 
0x64784328 // bfdot z8.s, z25.h, z0.h[3]\n" + ".inst 0x6479432c // bfdot z12.s, z25.h, z1.h[3]\n" + ".inst 0x647a4330 // bfdot z16.s, z25.h, z2.h[3]\n" + ".inst 0x647b4334 // bfdot z20.s, z25.h, z3.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x64784309 // bfdot z9.s, z24.h, z0.h[3]\n" + ".inst 0x6479430d // bfdot z13.s, z24.h, z1.h[3]\n" + ".inst 0x647a4311 // bfdot z17.s, z24.h, z2.h[3]\n" + ".inst 0x647b4315 // bfdot z21.s, z24.h, z3.h[3]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x6478432a // bfdot z10.s, z25.h, z0.h[3]\n" + ".inst 0x6479432e // bfdot z14.s, z25.h, z1.h[3]\n" + ".inst 0x647a4332 // bfdot z18.s, z25.h, z2.h[3]\n" + ".inst 0x647b4336 // bfdot z22.s, z25.h, z3.h[3]\n" + ".inst 0x6478430b // bfdot z11.s, z24.h, z0.h[3]\n" + ".inst 0x6479430f // bfdot z15.s, z24.h, z1.h[3]\n" + ".inst 0x647a4313 // bfdot z19.s, z24.h, z2.h[3]\n" + ".inst 0x647b4317 // bfdot z23.s, z24.h, z3.h[3]\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1098,41 +1098,41 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin z14.s, p5/M, z14.s, z25.s\n" + "fmin z15.s, p5/M, z15.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmin z20.s, p5/M, z20.s, z25.s\n" + "fmin z21.s, p5/M, z21.s, z25.s\n" + "fmin z22.s, p5/M, z22.s, z25.s\n" + 
"fmin z23.s, p5/M, z23.s, z25.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, z11.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z15.s, p5/M, z15.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" + "fmax z20.s, p5/M, z20.s, z24.s\n" + "fmax z21.s, p5/M, z21.s, z24.s\n" + "fmax z22.s, p5/M, z22.s, z24.s\n" + "fmax z23.s, p5/M, z23.s, z24.s\n" "51:" // Height 4: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -1196,30 +1196,30 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1247,15 +1247,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1266,124 +1266,124 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "60:" // Height 5: input setup done "cmp x27, #0x8\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z4.h }, p0/Z, [x26]\n" + "ld1rqh { z3.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1rqh { z0.h }, p0/Z, [x22]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + ".inst 0x646443a8 // bfdot z8.s, z29.h, z4.h[0]\n" + ".inst 0x646343ac // bfdot z12.s, z29.h, z3.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n" + ".inst 0x646143b4 // bfdot z20.s, z29.h, z1.h[0]\n" "add x25, x25, #0x10\n" - ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646043b8 // bfdot z24.s, z29.h, z0.h[0]\n" + ".inst 0x64644389 // bfdot z9.s, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x6463438d // bfdot z13.s, z28.h, z3.h[0]\n" + ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, 
MUL VL]\n" + ".inst 0x64614395 // bfdot z21.s, z28.h, z1.h[0]\n" + ".inst 0x64604399 // bfdot z25.s, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x646443aa // bfdot z10.s, z29.h, z4.h[0]\n" + ".inst 0x646343ae // bfdot z14.s, z29.h, z3.h[0]\n" + ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n" + ".inst 0x646143b6 // bfdot z22.s, z29.h, z1.h[0]\n" + ".inst 0x646043ba // bfdot z26.s, z29.h, z0.h[0]\n" + ".inst 0x6464438b // bfdot z11.s, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6463438f // bfdot z15.s, z28.h, z3.h[0]\n" + ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n" + ".inst 0x64614397 // bfdot z23.s, z28.h, z1.h[0]\n" + ".inst 0x6460439b // bfdot z27.s, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x646c43a8 // bfdot z8.s, z29.h, z4.h[1]\n" + ".inst 0x646b43ac // bfdot z12.s, z29.h, z3.h[1]\n" + ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n" + ".inst 0x646943b4 // bfdot z20.s, z29.h, z1.h[1]\n" + ".inst 0x646843b8 // bfdot z24.s, z29.h, z0.h[1]\n" + ".inst 0x646c4389 // bfdot z9.s, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x646b438d // bfdot z13.s, z28.h, z3.h[1]\n" + ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n" + ".inst 0x64694395 // bfdot z21.s, z28.h, z1.h[1]\n" + ".inst 0x64684399 // bfdot z25.s, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - ".inst 0x647c40d8 // bfdot 
z24.s, z6.h, z4.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x646c43aa // bfdot z10.s, z29.h, z4.h[1]\n" + ".inst 0x646b43ae // bfdot z14.s, z29.h, z3.h[1]\n" + ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n" + ".inst 0x646943b6 // bfdot z22.s, z29.h, z1.h[1]\n" + ".inst 0x646843ba // bfdot z26.s, z29.h, z0.h[1]\n" + ".inst 0x646c438b // bfdot z11.s, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x646b438f // bfdot z15.s, z28.h, z3.h[1]\n" + ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n" + ".inst 0x64694397 // bfdot z23.s, z28.h, z1.h[1]\n" + ".inst 0x6468439b // bfdot z27.s, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x647443a8 // bfdot z8.s, z29.h, z4.h[2]\n" + ".inst 0x647343ac // bfdot z12.s, z29.h, z3.h[2]\n" + ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n" + ".inst 0x647143b4 // bfdot z20.s, z29.h, z1.h[2]\n" + ".inst 0x647043b8 // bfdot z24.s, z29.h, z0.h[2]\n" + ".inst 0x64744389 // bfdot z9.s, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6473438d // bfdot z13.s, z28.h, z3.h[2]\n" + ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n" + ".inst 0x64714395 // bfdot z21.s, z28.h, z1.h[2]\n" + ".inst 0x64704399 // bfdot z25.s, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x647443aa // bfdot z10.s, z29.h, z4.h[2]\n" + ".inst 0x647343ae // bfdot z14.s, z29.h, z3.h[2]\n" + ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n" + ".inst 0x647143b6 // bfdot z22.s, z29.h, z1.h[2]\n" + ".inst 0x647043ba // bfdot z26.s, z29.h, z0.h[2]\n" + ".inst 0x6474438b // bfdot z11.s, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6473438f // bfdot z15.s, z28.h, z3.h[2]\n" + ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n" + ".inst 0x64714397 // bfdot z23.s, z28.h, z1.h[2]\n" + ".inst 0x6470439b // bfdot z27.s, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x647c43a8 // bfdot z8.s, z29.h, z4.h[3]\n" + ".inst 0x647b43ac // bfdot z12.s, z29.h, z3.h[3]\n" + ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n" + ".inst 0x647943b4 // bfdot z20.s, z29.h, z1.h[3]\n" + ".inst 0x647843b8 // bfdot z24.s, z29.h, z0.h[3]\n" + ".inst 0x647c4389 // bfdot z9.s, z28.h, z4.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x647b438d // bfdot z13.s, z28.h, z3.h[3]\n" + ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n" + ".inst 0x64794395 // bfdot z21.s, z28.h, z1.h[3]\n" + ".inst 0x64784399 // bfdot z25.s, z28.h, z0.h[3]\n" + "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x647c43aa // bfdot z10.s, z29.h, z4.h[3]\n" + ".inst 0x647b43ae // bfdot z14.s, z29.h, z3.h[3]\n" + 
".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n" + ".inst 0x647943b6 // bfdot z22.s, z29.h, z1.h[3]\n" + ".inst 0x647843ba // bfdot z26.s, z29.h, z0.h[3]\n" + ".inst 0x647c438b // bfdot z11.s, z28.h, z4.h[3]\n" + ".inst 0x647b438f // bfdot z15.s, z28.h, z3.h[3]\n" + ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n" + ".inst 0x64794397 // bfdot z23.s, z28.h, z1.h[3]\n" + ".inst 0x6478439b // bfdot z27.s, z28.h, z0.h[3]\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1393,111 +1393,111 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + ".inst 0x646043a8 // bfdot z8.s, z29.h, z0.h[0]\n" + ".inst 0x646143ac // bfdot z12.s, z29.h, z1.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646243b0 // bfdot z16.s, z29.h, z2.h[0]\n" + ".inst 0x646343b4 // bfdot z20.s, z29.h, z3.h[0]\n" + ".inst 0x646443b8 // bfdot z24.s, z29.h, z4.h[0]\n" + ".inst 0x64604389 // bfdot z9.s, z28.h, z0.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6461438d // bfdot z13.s, z28.h, z1.h[0]\n" + ".inst 0x64624391 // bfdot z17.s, z28.h, z2.h[0]\n" + ".inst 0x64634395 // bfdot z21.s, z28.h, z3.h[0]\n" + ".inst 0x64644399 // bfdot z25.s, z28.h, z4.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + ".inst 0x646043aa // bfdot z10.s, z29.h, z0.h[0]\n" + ".inst 0x646143ae // bfdot z14.s, z29.h, z1.h[0]\n" + ".inst 0x646243b2 // bfdot z18.s, z29.h, z2.h[0]\n" + ".inst 0x646343b6 // bfdot z22.s, z29.h, z3.h[0]\n" + ".inst 0x646443ba // bfdot z26.s, z29.h, z4.h[0]\n" + ".inst 0x6460438b // bfdot z11.s, z28.h, z0.h[0]\n" + ".inst 0x6461438f // bfdot z15.s, z28.h, z1.h[0]\n" + ".inst 0x64624393 // bfdot z19.s, z28.h, z2.h[0]\n" + ".inst 0x64634397 // bfdot z23.s, z28.h, z3.h[0]\n" + ".inst 0x6464439b // bfdot z27.s, z28.h, z4.h[0]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, 
#1, MUL VL]\n" + ".inst 0x646843a8 // bfdot z8.s, z29.h, z0.h[1]\n" + ".inst 0x646943ac // bfdot z12.s, z29.h, z1.h[1]\n" + ".inst 0x646a43b0 // bfdot z16.s, z29.h, z2.h[1]\n" + ".inst 0x646b43b4 // bfdot z20.s, z29.h, z3.h[1]\n" "subs x27, x27, #0x2\n" - ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x646c43b8 // bfdot z24.s, z29.h, z4.h[1]\n" + ".inst 0x64684389 // bfdot z9.s, z28.h, z0.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6469438d // bfdot z13.s, z28.h, z1.h[1]\n" + ".inst 0x646a4391 // bfdot z17.s, z28.h, z2.h[1]\n" + ".inst 0x646b4395 // bfdot z21.s, z28.h, z3.h[1]\n" + ".inst 0x646c4399 // bfdot z25.s, z28.h, z4.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + ".inst 0x646843aa // bfdot z10.s, z29.h, z0.h[1]\n" + ".inst 0x646943ae // bfdot z14.s, z29.h, z1.h[1]\n" + ".inst 0x646a43b2 // bfdot z18.s, z29.h, z2.h[1]\n" + ".inst 0x646b43b6 // bfdot z22.s, z29.h, z3.h[1]\n" + ".inst 0x646c43ba // bfdot z26.s, z29.h, z4.h[1]\n" + ".inst 0x6468438b // bfdot z11.s, z28.h, z0.h[1]\n" + ".inst 0x6469438f // bfdot z15.s, z28.h, z1.h[1]\n" + ".inst 0x646a4393 // bfdot z19.s, z28.h, z2.h[1]\n" + ".inst 0x646b4397 // bfdot z23.s, z28.h, z3.h[1]\n" + ".inst 0x646c439b // bfdot z27.s, z28.h, z4.h[1]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647043a8 // bfdot z8.s, z29.h, z0.h[2]\n" + ".inst 0x647143ac // bfdot z12.s, z29.h, z1.h[2]\n" + ".inst 0x647243b0 // bfdot z16.s, z29.h, z2.h[2]\n" + ".inst 0x647343b4 // bfdot z20.s, z29.h, z3.h[2]\n" "subs x27, x27, #0x2\n" - ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x647443b8 // bfdot z24.s, z29.h, z4.h[2]\n" + ".inst 0x64704389 // bfdot z9.s, z28.h, z0.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6471438d // bfdot z13.s, z28.h, z1.h[2]\n" + ".inst 0x64724391 // bfdot z17.s, z28.h, z2.h[2]\n" + ".inst 0x64734395 // bfdot z21.s, z28.h, z3.h[2]\n" + ".inst 0x64744399 // bfdot z25.s, z28.h, 
z4.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + ".inst 0x647043aa // bfdot z10.s, z29.h, z0.h[2]\n" + ".inst 0x647143ae // bfdot z14.s, z29.h, z1.h[2]\n" + ".inst 0x647243b2 // bfdot z18.s, z29.h, z2.h[2]\n" + ".inst 0x647343b6 // bfdot z22.s, z29.h, z3.h[2]\n" + ".inst 0x647443ba // bfdot z26.s, z29.h, z4.h[2]\n" + ".inst 0x6470438b // bfdot z11.s, z28.h, z0.h[2]\n" + ".inst 0x6471438f // bfdot z15.s, z28.h, z1.h[2]\n" + ".inst 0x64724393 // bfdot z19.s, z28.h, z2.h[2]\n" + ".inst 0x64734397 // bfdot z23.s, z28.h, z3.h[2]\n" + ".inst 0x6474439b // bfdot z27.s, z28.h, z4.h[2]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647843a8 // bfdot z8.s, z29.h, z0.h[3]\n" + ".inst 0x647943ac // bfdot z12.s, z29.h, z1.h[3]\n" + ".inst 0x647a43b0 // bfdot z16.s, z29.h, z2.h[3]\n" + ".inst 0x647b43b4 // bfdot z20.s, z29.h, z3.h[3]\n" + ".inst 0x647c43b8 // bfdot z24.s, z29.h, z4.h[3]\n" + ".inst 0x64784389 // bfdot z9.s, z28.h, z0.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6479438d // bfdot z13.s, z28.h, z1.h[3]\n" + ".inst 0x647a4391 // bfdot z17.s, z28.h, z2.h[3]\n" + ".inst 0x647b4395 // bfdot z21.s, z28.h, z3.h[3]\n" + ".inst 0x647c4399 // bfdot z25.s, z28.h, z4.h[3]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x647843aa // bfdot z10.s, z29.h, z0.h[3]\n" + ".inst 0x647943ae // bfdot z14.s, z29.h, z1.h[3]\n" + ".inst 0x647a43b2 // bfdot z18.s, z29.h, z2.h[3]\n" + ".inst 0x647b43b6 // bfdot z22.s, z29.h, z3.h[3]\n" + ".inst 0x647c43ba // bfdot z26.s, z29.h, z4.h[3]\n" + ".inst 0x6478438b // bfdot z11.s, z28.h, z0.h[3]\n" + ".inst 0x6479438f // bfdot z15.s, z28.h, z1.h[3]\n" + ".inst 0x647a4393 // bfdot z19.s, z28.h, z2.h[3]\n" + 
".inst 0x647b4397 // bfdot z23.s, z28.h, z3.h[3]\n" + ".inst 0x647c439b // bfdot z27.s, z28.h, z4.h[3]\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1510,49 +1510,49 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z29.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - "fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "ld1rw { z28.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z29.s\n" + "fmin z9.s, p5/M, z9.s, z29.s\n" + "fmin z10.s, p5/M, z10.s, z29.s\n" + "fmin z11.s, p5/M, z11.s, z29.s\n" + "fmin z12.s, p5/M, z12.s, z29.s\n" + "fmin z13.s, p5/M, z13.s, z29.s\n" + "fmin z14.s, p5/M, z14.s, z29.s\n" + "fmin z15.s, p5/M, z15.s, z29.s\n" + "fmin z16.s, p5/M, z16.s, z29.s\n" + "fmin z17.s, p5/M, z17.s, z29.s\n" + "fmin z18.s, p5/M, z18.s, z29.s\n" + "fmin z19.s, p5/M, z19.s, z29.s\n" + "fmin z20.s, p5/M, z20.s, z29.s\n" + "fmin z21.s, p5/M, z21.s, z29.s\n" + "fmin z22.s, p5/M, z22.s, z29.s\n" + "fmin z23.s, p5/M, z23.s, z29.s\n" + "fmin z24.s, p5/M, z24.s, z29.s\n" + "fmin z25.s, p5/M, z25.s, z29.s\n" + "fmin z26.s, p5/M, z26.s, z29.s\n" + "fmin z27.s, p5/M, z27.s, z29.s\n" + "fmax z8.s, p5/M, z8.s, z28.s\n" + "fmax z9.s, p5/M, z9.s, z28.s\n" + "fmax z10.s, p5/M, z10.s, z28.s\n" + "fmax z11.s, p5/M, z11.s, z28.s\n" + "fmax z12.s, p5/M, z12.s, z28.s\n" + "fmax z13.s, p5/M, z13.s, z28.s\n" + "fmax z14.s, p5/M, z14.s, z28.s\n" + "fmax z15.s, p5/M, z15.s, z28.s\n" + "fmax z16.s, p5/M, z16.s, z28.s\n" + "fmax z17.s, p5/M, z17.s, z28.s\n" + "fmax z18.s, p5/M, z18.s, z28.s\n" + "fmax z19.s, p5/M, z19.s, z28.s\n" + "fmax z20.s, p5/M, z20.s, z28.s\n" + "fmax z21.s, p5/M, z21.s, z28.s\n" + "fmax z22.s, p5/M, z22.s, z28.s\n" + "fmax z23.s, p5/M, z23.s, z28.s\n" + "fmax z24.s, p5/M, z24.s, z28.s\n" + "fmax z25.s, p5/M, z25.s, z28.s\n" + "fmax z26.s, p5/M, z26.s, z28.s\n" + "fmax z27.s, p5/M, z27.s, z28.s\n" "64:" // Height 5: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, 
[x9, #1, MUL VL]\n" @@ -1627,35 +1627,35 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" + "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" - "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1687,16 +1687,16 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1708,143 +1708,143 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, 
LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "73:" // Height 6: input setup done "cmp x27, #0x8\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z7.h }, p0/Z, [x26]\n" + "ld1rqh { z6.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z5.h }, p0/Z, [x24]\n" + "ld1rqh { z4.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1rqh { z5.h }, p0/Z, [x21]\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "ld1rqh { z2.h }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x64674028 // bfdot z8.s, z1.h, z7.h[0]\n" + ".inst 0x6466402c // bfdot z12.s, z1.h, z6.h[0]\n" + ".inst 0x64654030 // bfdot z16.s, z1.h, z5.h[0]\n" + ".inst 0x64644034 // bfdot z20.s, z1.h, z4.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x64634038 // bfdot z24.s, z1.h, z3.h[0]\n" + ".inst 0x6462403c // bfdot z28.s, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" "add x21, x21, #0x10\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" - ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" - ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" - ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - ".inst 0x646c40f9 // bfdot 
z25.s, z7.h, z4.h[1]\n" - ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x64674009 // bfdot z9.s, z0.h, z7.h[0]\n" + ".inst 0x6466400d // bfdot z13.s, z0.h, z6.h[0]\n" + ".inst 0x64654011 // bfdot z17.s, z0.h, z5.h[0]\n" + ".inst 0x64644015 // bfdot z21.s, z0.h, z4.h[0]\n" + ".inst 0x64634019 // bfdot z25.s, z0.h, z3.h[0]\n" + ".inst 0x6462401d // bfdot z29.s, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6467402a // bfdot z10.s, z1.h, z7.h[0]\n" + ".inst 0x6466402e // bfdot z14.s, z1.h, z6.h[0]\n" + ".inst 0x64654032 // bfdot z18.s, z1.h, z5.h[0]\n" + ".inst 0x64644036 // bfdot z22.s, z1.h, z4.h[0]\n" + ".inst 0x6463403a // bfdot z26.s, z1.h, z3.h[0]\n" + ".inst 0x6462403e // bfdot z30.s, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6467400b // bfdot z11.s, z0.h, z7.h[0]\n" + ".inst 0x6466400f // bfdot z15.s, z0.h, z6.h[0]\n" + ".inst 0x64654013 // bfdot z19.s, z0.h, z5.h[0]\n" + ".inst 0x64644017 // bfdot z23.s, z0.h, z4.h[0]\n" + ".inst 0x6463401b // bfdot z27.s, z0.h, z3.h[0]\n" + ".inst 0x6462401f // bfdot z31.s, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x646f4028 // bfdot z8.s, z1.h, z7.h[1]\n" + ".inst 0x646e402c // bfdot z12.s, z1.h, z6.h[1]\n" + ".inst 0x646d4030 // bfdot z16.s, z1.h, z5.h[1]\n" + ".inst 0x646c4034 // bfdot z20.s, z1.h, z4.h[1]\n" + ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n" + ".inst 0x646a403c // bfdot z28.s, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x646f4009 // bfdot z9.s, z0.h, z7.h[1]\n" + ".inst 0x646e400d // bfdot z13.s, z0.h, z6.h[1]\n" + ".inst 0x646d4011 // bfdot z17.s, z0.h, z5.h[1]\n" + ".inst 0x646c4015 // bfdot z21.s, z0.h, z4.h[1]\n" + ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n" + ".inst 0x646a401d // bfdot z29.s, z0.h, z2.h[1]\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" - ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" - ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" - ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, 
z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" - ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" - ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" - ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" - ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" - ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + ".inst 0x646f402a // bfdot z10.s, z1.h, z7.h[1]\n" + ".inst 0x646e402e // bfdot z14.s, z1.h, z6.h[1]\n" + ".inst 0x646d4032 // bfdot z18.s, z1.h, z5.h[1]\n" + ".inst 0x646c4036 // bfdot z22.s, z1.h, z4.h[1]\n" + ".inst 0x646b403a // bfdot z26.s, z1.h, z3.h[1]\n" + ".inst 0x646a403e // bfdot z30.s, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x646f400b // bfdot z11.s, z0.h, z7.h[1]\n" + ".inst 0x646e400f // bfdot z15.s, z0.h, z6.h[1]\n" + ".inst 0x646d4013 // bfdot z19.s, z0.h, z5.h[1]\n" + ".inst 0x646c4017 // bfdot z23.s, z0.h, z4.h[1]\n" + ".inst 0x646b401b // bfdot z27.s, z0.h, z3.h[1]\n" + ".inst 0x646a401f // bfdot z31.s, z0.h, z2.h[1]\n" + "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x64774028 // bfdot z8.s, z1.h, z7.h[2]\n" + ".inst 0x6476402c // bfdot z12.s, z1.h, z6.h[2]\n" + ".inst 0x64754030 // bfdot z16.s, z1.h, z5.h[2]\n" + ".inst 0x64744034 // bfdot z20.s, z1.h, z4.h[2]\n" + ".inst 0x64734038 // bfdot z24.s, z1.h, z3.h[2]\n" + ".inst 0x6472403c // bfdot z28.s, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x64774009 // bfdot z9.s, z0.h, z7.h[2]\n" + ".inst 0x6476400d // bfdot z13.s, z0.h, z6.h[2]\n" + ".inst 0x64754011 // bfdot z17.s, z0.h, z5.h[2]\n" + ".inst 0x64744015 // bfdot z21.s, z0.h, z4.h[2]\n" + ".inst 0x64734019 // bfdot z25.s, z0.h, z3.h[2]\n" + ".inst 0x6472401d // bfdot z29.s, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6477402a // bfdot z10.s, z1.h, z7.h[2]\n" + ".inst 0x6476402e // bfdot z14.s, z1.h, z6.h[2]\n" + ".inst 0x64754032 // bfdot z18.s, z1.h, 
z5.h[2]\n" + ".inst 0x64744036 // bfdot z22.s, z1.h, z4.h[2]\n" + ".inst 0x6473403a // bfdot z26.s, z1.h, z3.h[2]\n" + ".inst 0x6472403e // bfdot z30.s, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6477400b // bfdot z11.s, z0.h, z7.h[2]\n" + ".inst 0x6476400f // bfdot z15.s, z0.h, z6.h[2]\n" + ".inst 0x64754013 // bfdot z19.s, z0.h, z5.h[2]\n" + ".inst 0x64744017 // bfdot z23.s, z0.h, z4.h[2]\n" + ".inst 0x6473401b // bfdot z27.s, z0.h, z3.h[2]\n" + ".inst 0x6472401f // bfdot z31.s, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x647f4028 // bfdot z8.s, z1.h, z7.h[3]\n" + ".inst 0x647e402c // bfdot z12.s, z1.h, z6.h[3]\n" + ".inst 0x647d4030 // bfdot z16.s, z1.h, z5.h[3]\n" + ".inst 0x647c4034 // bfdot z20.s, z1.h, z4.h[3]\n" + ".inst 0x647b4038 // bfdot z24.s, z1.h, z3.h[3]\n" + ".inst 0x647a403c // bfdot z28.s, z1.h, z2.h[3]\n" + "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x647f4009 // bfdot z9.s, z0.h, z7.h[3]\n" + ".inst 0x647e400d // bfdot z13.s, z0.h, z6.h[3]\n" + ".inst 0x647d4011 // bfdot z17.s, z0.h, z5.h[3]\n" + ".inst 0x647c4015 // bfdot z21.s, z0.h, z4.h[3]\n" + ".inst 0x647b4019 // bfdot z25.s, z0.h, z3.h[3]\n" + ".inst 0x647a401d // bfdot z29.s, z0.h, z2.h[3]\n" + "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x647f402a // bfdot z10.s, z1.h, z7.h[3]\n" + ".inst 0x647e402e // bfdot z14.s, z1.h, z6.h[3]\n" + ".inst 0x647d4032 // bfdot z18.s, z1.h, z5.h[3]\n" + ".inst 0x647c4036 // bfdot z22.s, z1.h, z4.h[3]\n" + ".inst 0x647b403a // bfdot z26.s, z1.h, z3.h[3]\n" + ".inst 0x647a403e // bfdot z30.s, z1.h, z2.h[3]\n" + ".inst 0x647f400b // bfdot z11.s, z0.h, z7.h[3]\n" + ".inst 0x647e400f // bfdot z15.s, z0.h, z6.h[3]\n" + ".inst 0x647d4013 // bfdot z19.s, z0.h, z5.h[3]\n" + ".inst 0x647c4017 // bfdot z23.s, z0.h, z4.h[3]\n" + ".inst 0x647b401b // bfdot z27.s, z0.h, z3.h[3]\n" + ".inst 0x647a401f // bfdot z31.s, z0.h, z2.h[3]\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1855,127 +1855,127 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" "ld1rqh { z5.h }, p0/Z, [x21]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" - ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" - ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" - ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" - ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" - ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" - ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" - ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" - ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" - ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" - ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n" + ".inst 0x646140ec // bfdot z12.s, z7.h, z1.h[0]\n" + ".inst 0x646240f0 // bfdot z16.s, z7.h, z2.h[0]\n" + ".inst 0x646340f4 // bfdot z20.s, z7.h, z3.h[0]\n" + ".inst 0x646440f8 // bfdot z24.s, z7.h, z4.h[0]\n" + ".inst 0x646540fc // bfdot z28.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646040c9 // bfdot z9.s, z6.h, z0.h[0]\n" + ".inst 0x646140cd // bfdot z13.s, z6.h, z1.h[0]\n" + ".inst 
0x646240d1 // bfdot z17.s, z6.h, z2.h[0]\n" + ".inst 0x646340d5 // bfdot z21.s, z6.h, z3.h[0]\n" + ".inst 0x646440d9 // bfdot z25.s, z6.h, z4.h[0]\n" + ".inst 0x646540dd // bfdot z29.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" - ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" - ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" - ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" - ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" - ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" - ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" - ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" - ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" - ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" + ".inst 0x646040ea // bfdot z10.s, z7.h, z0.h[0]\n" + ".inst 0x646140ee // bfdot z14.s, z7.h, z1.h[0]\n" + ".inst 0x646240f2 // bfdot z18.s, z7.h, z2.h[0]\n" + ".inst 0x646340f6 // bfdot z22.s, z7.h, z3.h[0]\n" + ".inst 0x646440fa // bfdot z26.s, z7.h, z4.h[0]\n" + ".inst 0x646540fe // bfdot z30.s, z7.h, z5.h[0]\n" + ".inst 0x646040cb // bfdot z11.s, z6.h, z0.h[0]\n" + ".inst 0x646140cf // bfdot z15.s, z6.h, z1.h[0]\n" + ".inst 0x646240d3 // bfdot z19.s, z6.h, z2.h[0]\n" + ".inst 0x646340d7 // bfdot z23.s, z6.h, z3.h[0]\n" + ".inst 0x646440db // bfdot z27.s, z6.h, z4.h[0]\n" + ".inst 0x646540df // bfdot z31.s, z6.h, z5.h[0]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" - ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x646840e8 // bfdot z8.s, z7.h, z0.h[1]\n" + ".inst 0x646940ec // bfdot z12.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f0 // bfdot z16.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f4 // bfdot z20.s, z7.h, z3.h[1]\n" "subs x27, x27, #0x2\n" - ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" - ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" - ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" - ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" - ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x646c40f8 // bfdot z24.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fc // bfdot z28.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x646840c9 // bfdot z9.s, z6.h, z0.h[1]\n" + ".inst 0x646940cd // bfdot z13.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d5 // bfdot z21.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d9 // bfdot z25.s, z6.h, z4.h[1]\n" + ".inst 0x646d40dd // bfdot z29.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" - ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" - ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" - ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" - ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" - ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" - ".inst 0x646840eb // bfdot 
z11.s, z7.h, z0.h[1]\n" - ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" - ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" - ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" - ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" - ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" + ".inst 0x646840ea // bfdot z10.s, z7.h, z0.h[1]\n" + ".inst 0x646940ee // bfdot z14.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f2 // bfdot z18.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f6 // bfdot z22.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fa // bfdot z26.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fe // bfdot z30.s, z7.h, z5.h[1]\n" + ".inst 0x646840cb // bfdot z11.s, z6.h, z0.h[1]\n" + ".inst 0x646940cf // bfdot z15.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d3 // bfdot z19.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d7 // bfdot z23.s, z6.h, z3.h[1]\n" + ".inst 0x646c40db // bfdot z27.s, z6.h, z4.h[1]\n" + ".inst 0x646d40df // bfdot z31.s, z6.h, z5.h[1]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" - ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647040e8 // bfdot z8.s, z7.h, z0.h[2]\n" + ".inst 0x647140ec // bfdot z12.s, z7.h, z1.h[2]\n" + ".inst 0x647240f0 // bfdot z16.s, z7.h, z2.h[2]\n" + ".inst 0x647340f4 // bfdot z20.s, z7.h, z3.h[2]\n" "subs x27, x27, #0x2\n" - ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" - ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" - ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" - ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" - ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" - ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" - ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x647440f8 // bfdot z24.s, z7.h, z4.h[2]\n" + ".inst 0x647540fc // bfdot z28.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x647040c9 // bfdot z9.s, z6.h, z0.h[2]\n" + ".inst 0x647140cd // bfdot z13.s, z6.h, z1.h[2]\n" + ".inst 0x647240d1 // bfdot z17.s, z6.h, z2.h[2]\n" + ".inst 0x647340d5 // bfdot z21.s, z6.h, z3.h[2]\n" + ".inst 0x647440d9 // bfdot z25.s, z6.h, z4.h[2]\n" + ".inst 0x647540dd // bfdot z29.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" - ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" - ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" - ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" - ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" - ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n" - ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" - ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" - ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" - ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" - ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" - ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" + ".inst 0x647040ea // bfdot z10.s, z7.h, z0.h[2]\n" + ".inst 0x647140ee // bfdot z14.s, z7.h, z1.h[2]\n" + ".inst 0x647240f2 // bfdot z18.s, z7.h, z2.h[2]\n" + ".inst 0x647340f6 // bfdot z22.s, z7.h, z3.h[2]\n" + ".inst 0x647440fa // bfdot z26.s, z7.h, z4.h[2]\n" + ".inst 0x647540fe // bfdot z30.s, z7.h, z5.h[2]\n" + ".inst 
0x647040cb // bfdot z11.s, z6.h, z0.h[2]\n" + ".inst 0x647140cf // bfdot z15.s, z6.h, z1.h[2]\n" + ".inst 0x647240d3 // bfdot z19.s, z6.h, z2.h[2]\n" + ".inst 0x647340d7 // bfdot z23.s, z6.h, z3.h[2]\n" + ".inst 0x647440db // bfdot z27.s, z6.h, z4.h[2]\n" + ".inst 0x647540df // bfdot z31.s, z6.h, z5.h[2]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" - ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" - ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" - ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" - ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" - ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" - ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x647840e8 // bfdot z8.s, z7.h, z0.h[3]\n" + ".inst 0x647940ec // bfdot z12.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f0 // bfdot z16.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f4 // bfdot z20.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f8 // bfdot z24.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fc // bfdot z28.s, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x647840c9 // bfdot z9.s, z6.h, z0.h[3]\n" + ".inst 0x647940cd // bfdot z13.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d1 // bfdot z17.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d5 // bfdot z21.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d9 // bfdot z25.s, z6.h, z4.h[3]\n" + ".inst 0x647d40dd // bfdot z29.s, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" - ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" - ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" - ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" - ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" - ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" - ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" - ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" - ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" - ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" - ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" - ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + ".inst 0x647840ea // bfdot z10.s, z7.h, z0.h[3]\n" + ".inst 0x647940ee // bfdot z14.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f2 // bfdot z18.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f6 // bfdot z22.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fa // bfdot z26.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fe // bfdot z30.s, z7.h, z5.h[3]\n" + ".inst 0x647840cb // bfdot z11.s, z6.h, z0.h[3]\n" + ".inst 0x647940cf // bfdot z15.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d7 // bfdot z23.s, z6.h, z3.h[3]\n" + ".inst 0x647c40db // bfdot z27.s, z6.h, z4.h[3]\n" + ".inst 0x647d40df // bfdot z31.s, z6.h, z5.h[3]\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2082,7 +2082,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" 
(output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -2090,4 +2089,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp index b8d237ff23..223d8a78de 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -75,7 +75,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -100,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp index 9bb67f18d2..74e2d267bc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp @@ -133,16 +133,16 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 5f\n" "3:" // Height 1: no bias "tbz %x[flags], #0, 4f\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z16.d, z12.d\n" + "zip2 z12.d, z16.d, z12.d\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 5f\n" @@ -160,11 +160,11 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -176,86 +176,86 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "ble 10f\n" "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqh { z20.h }, p0/Z, [x26]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, 
z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n" + ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n" + ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6470e68a // bfmmla z10.s, z20.h, z16.h\n" + ".inst 0x6471e68e // bfmmla z14.s, z20.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n" + ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n" "add x26, x26, #0x10\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + 
"ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x4\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "addvl x10, x10, #8\n" "ble 11f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n" + ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n" + ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n" + ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6471e42b // bfmmla z11.s, z1.h, z17.h\n" + ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n" "addvl x10, x10, #8\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -268,17 +268,17 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "uzp1 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z21.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z21.s\n" + "fmin z9.s, p5/M, z9.s, z21.s\n" + "fmin z10.s, p5/M, z10.s, z21.s\n" + "fmin z11.s, p5/M, z11.s, z21.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -322,21 +322,21 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, 
x9, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 18f\n" @@ -354,12 +354,12 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -367,95 +367,95 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "21:" // Height 2: input setup done "cmp x27, #0x8\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqh { z20.h }, p0/Z, [x26]\n" + "ld1rqh { z19.h }, p0/Z, [x25]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 
z1.d, z1.d, z2.d\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6471e688 // bfmmla z8.s, z20.h, z17.h\n" + ".inst 0x6470e68c // bfmmla z12.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6471e689 // bfmmla z9.s, z20.h, z17.h\n" + ".inst 0x6470e68d // bfmmla z13.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6471e68a // bfmmla z10.s, z20.h, z17.h\n" + ".inst 0x6470e68e // bfmmla z14.s, z20.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + ".inst 0x6471e68b // bfmmla z11.s, z20.h, z17.h\n" + ".inst 0x6470e68f // bfmmla z15.s, z20.h, z16.h\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqh { z19.h }, p0/Z, [x25]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x4\n" - "trn2 z1.d, z1.d, z2.d\n" - 
".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "addvl x10, x10, #8\n" "ble 24f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e428 // bfmmla z8.s, z1.h, z17.h\n" + ".inst 0x6470e42c // bfmmla z12.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e429 // bfmmla z9.s, z1.h, z17.h\n" + ".inst 0x6470e42d // bfmmla z13.s, z1.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e42a // bfmmla z10.s, z1.h, z17.h\n" + ".inst 0x6470e42e // bfmmla z14.s, z1.h, z16.h\n" + "ld1h { z22.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6476e42b // bfmmla z11.s, z1.h, z22.h\n" + ".inst 0x6470e42f // bfmmla z15.s, z1.h, z16.h\n" "addvl x10, x10, #8\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -474,25 +474,25 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "uzp2 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, p5/M, z14.s, z17.s\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z7.s, p5/M, z7.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "25:" // Height 2: No activation "st1w { z7.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -548,28 +548,28 @@ void 
sve_hybrid_bf16fp32_mmla_6x4VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x20]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" @@ -601,13 +601,13 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -616,136 +616,136 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "34:" // Height 3: input setup done "cmp x27, #0x8\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla 
z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqh { z30.h }, p0/Z, [x26]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "ld1rqh { z28.h }, p0/Z, [x24]\n" + "trn1 z27.d, z30.d, z24.d\n" + "trn2 z30.d, z30.d, z24.d\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "trn1 z26.d, z28.d, z29.d\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z29.d\n" + ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, #0x8\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n" + ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x6467e42b 
// bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n" + ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n" + ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n" + ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n" + ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n" + ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n" + ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n" + ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n" + ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + "trn1 z27.d, z1.d, z24.d\n" + "trn2 z1.d, z1.d, z24.d\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "trn1 z26.d, z3.d, z28.d\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e768 // bfmmla z8.s, z27.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e76c // bfmmla z12.s, z27.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e769 // bfmmla z9.s, z27.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6478e76d // bfmmla z13.s, z27.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z28.d\n" + ".inst 0x6479e76a // bfmmla z10.s, z27.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e76e // bfmmla z14.s, z27.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + 
"ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6479e76b // bfmmla z11.s, z27.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + ".inst 0x6478e76f // bfmmla z15.s, z27.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "ble 37f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n" + ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n" + ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n" + ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n" + ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n" + ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n" + ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n" + ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n" + ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n" + ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n" + ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -768,33 +768,33 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "uzp1 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, 
p5/M, z11.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin z14.s, p5/M, z14.s, z25.s\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmax z7.s, p5/M, z7.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, z11.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" "38:" // Height 3: No activation "st1w { z7.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -854,37 +854,37 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x21]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, 
p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" @@ -912,14 +912,14 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -929,140 +929,140 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "47:" // Height 4: input setup done "cmp x27, #0x8\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqh { z30.h }, p0/Z, [x26]\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "trn1 z29.d, z30.d, z24.d\n" + "ld1rqh { z28.h }, p0/Z, [x24]\n" + "ld1rqh { z27.h }, p0/Z, [x23]\n" + "trn2 z30.d, z30.d, z24.d\n" + "trn1 z26.d, z28.d, z27.d\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z27.d\n" + ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6467e40a // bfmmla 
z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, #0x8\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6479e7c8 // bfmmla z8.s, z30.h, z25.h\n" + ".inst 0x6479e790 // bfmmla z16.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6478e7cc // bfmmla z12.s, z30.h, z24.h\n" + ".inst 0x6478e794 // bfmmla z20.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e7c9 // bfmmla z9.s, z30.h, z25.h\n" + ".inst 0x6479e791 // bfmmla z17.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6478e7cd // bfmmla z13.s, z30.h, z24.h\n" + ".inst 0x6478e795 // bfmmla z21.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6479e7ca // bfmmla z10.s, z30.h, z25.h\n" + ".inst 0x6479e792 // bfmmla z18.s, z28.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6478e7ce // bfmmla z14.s, z30.h, z24.h\n" + ".inst 0x6478e796 // bfmmla z22.s, z28.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6479e7cb // bfmmla z11.s, z30.h, z25.h\n" + ".inst 0x6479e793 // bfmmla z19.s, z28.h, z25.h\n" + ".inst 0x6478e7cf // bfmmla z15.s, z30.h, z24.h\n" + ".inst 0x6478e797 // bfmmla z23.s, z28.h, z24.h\n" "bgt 48b\n" "49:" // Height 
4: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z24.h }, p0/Z, [x25]\n" + "trn1 z28.d, z1.d, z24.d\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1rqh { z27.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z24.d\n" + "trn1 z26.d, z3.d, z27.d\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z27.d\n" + ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" + ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "ble 50f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - "ld1h 
{ z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6479e428 // bfmmla z8.s, z1.h, z25.h\n" + ".inst 0x6479e470 // bfmmla z16.s, z3.h, z25.h\n" + ".inst 0x6478e42c // bfmmla z12.s, z1.h, z24.h\n" + ".inst 0x6478e474 // bfmmla z20.s, z3.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e429 // bfmmla z9.s, z1.h, z25.h\n" + ".inst 0x6479e471 // bfmmla z17.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6478e42d // bfmmla z13.s, z1.h, z24.h\n" + ".inst 0x6478e475 // bfmmla z21.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e42a // bfmmla z10.s, z1.h, z25.h\n" + ".inst 0x6479e472 // bfmmla z18.s, z3.h, z25.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e42e // bfmmla z14.s, z1.h, z24.h\n" + ".inst 0x6478e476 // bfmmla z22.s, z3.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" + ".inst 0x6479e42b // bfmmla z11.s, z1.h, z25.h\n" + ".inst 0x6479e473 // bfmmla z19.s, z3.h, z25.h\n" + ".inst 0x6478e42f // bfmmla z15.s, z1.h, z24.h\n" + ".inst 0x6478e477 // bfmmla z23.s, z3.h, z24.h\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1090,41 +1090,41 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "uzp2 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z24.s\n" + "fmin z12.s, p5/M, z12.s, z24.s\n" + "fmin z13.s, p5/M, z13.s, z24.s\n" + "fmin z14.s, p5/M, z14.s, z24.s\n" + "fmin z8.s, p5/M, z8.s, z24.s\n" + "fmin z9.s, p5/M, z9.s, 
z24.s\n" + "fmin z10.s, p5/M, z10.s, z24.s\n" + "fmin z11.s, p5/M, z11.s, z24.s\n" + "fmin z15.s, p5/M, z15.s, z24.s\n" + "fmin z20.s, p5/M, z20.s, z24.s\n" + "fmin z21.s, p5/M, z21.s, z24.s\n" + "fmin z22.s, p5/M, z22.s, z24.s\n" + "fmin z16.s, p5/M, z16.s, z24.s\n" + "fmin z17.s, p5/M, z17.s, z24.s\n" + "fmin z18.s, p5/M, z18.s, z24.s\n" + "fmin z19.s, p5/M, z19.s, z24.s\n" + "fmax z7.s, p5/M, z7.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" "51:" // Height 4: No activation "st1w { z7.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1196,54 +1196,54 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x20]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, 
p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z19.d, z24.d, z23.d\n" "zip2 z23.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z24.d, z25.d, z28.d\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1275,15 +1275,15 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1294,180 +1294,180 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "60:" // Height 5: input setup done "cmp x27, #0x8\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1rqh { z5.h }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqh { z6.h }, p0/Z, [x26]\n" + "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z7.h }, p0/Z, [x24]\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn1 z5.d, z6.d, z1.d\n" + "trn2 z6.d, z6.d, z1.d\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "trn1 z3.d, z7.d, z2.d\n" + "trn2 z7.d, z7.d, z2.d\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "trn1 z2.d, z4.d, z0.d\n" + "trn2 z4.d, z4.d, z0.d\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6461e4a8 // bfmmla z8.s, z5.h, z1.h\n" + ".inst 0x6461e470 // bfmmla z16.s, z3.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4ac // bfmmla z12.s, z5.h, z0.h\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, 
[x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" "add x25, x25, #0x10\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6461e471 // bfmmla z17.s, z3.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6460e4ad // bfmmla z13.s, z5.h, z0.h\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461e4aa // bfmmla z10.s, z5.h, z1.h\n" + ".inst 0x6461e472 // bfmmla z18.s, z3.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ae // bfmmla z14.s, z5.h, z0.h\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, 
z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + ".inst 0x6461e473 // bfmmla z19.s, z3.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" + ".inst 0x6461e4f0 // bfmmla z16.s, z7.h, z1.h\n" + ".inst 0x6461e498 // bfmmla z24.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e4f4 // bfmmla z20.s, z7.h, z0.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" + ".inst 0x6461e4f1 // bfmmla z17.s, z7.h, z1.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" + ".inst 0x6461e4f2 // bfmmla z18.s, z7.h, z1.h\n" + ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e4f6 // bfmmla z22.s, z7.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" + ".inst 0x6461e4f3 // bfmmla z19.s, z7.h, z1.h\n" + ".inst 0x6461e49b // bfmmla z27.s, z4.h, z1.h\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e4f7 // bfmmla z23.s, z7.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" + "ld1rqh { z4.h }, p0/Z, [x25]\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn1 z7.d, z1.d, z4.d\n" + "trn2 z1.d, z1.d, z4.d\n" "ld1rqh { z5.h }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "trn1 z6.d, z3.d, z2.d\n" + "trn2 z3.d, z3.d, z2.d\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n" + ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n" + ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - 
".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n" + ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n" + ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n" "addvl x10, x10, #8\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + ".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n" + ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "ble 63f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 
0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n" + ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" + ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n" + ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n" + ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n" + ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n" + ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n" + ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n" + ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n" + ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1500,49 +1500,49 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "uzp1 z27.d, z27.d, z31.d\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" - "add x20, %x[args_ptr], %[offset_min]\n" "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z7.s, p5/M, z7.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z7.s, p5/M, z7.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" 
- "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - "fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z7.s, p5/M, z7.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z7.s, p5/M, z7.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" + "fmax z24.s, p5/M, z24.s, z23.s\n" + "fmax z25.s, p5/M, z25.s, z23.s\n" + "fmax z26.s, p5/M, z26.s, z23.s\n" + "fmax z27.s, p5/M, z27.s, z23.s\n" "64:" // Height 5: No activation "st1w { z7.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1621,59 +1621,59 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z17.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" "add x21, x22, x20, LSL #2\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip2 z12.d, z9.d, z12.d\n" - "zip1 z9.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z17.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "zip2 z12.d, z17.d, z12.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z20.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z20.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" - "ld1w 
{ z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" "zip2 z15.d, z16.d, z15.d\n" "zip1 z16.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x21]\n" "zip2 z21.d, z18.d, z21.d\n" "zip1 z18.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" "zip2 z23.d, z24.d, z23.d\n" "zip1 z24.d, z25.d, z28.d\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1705,16 +1705,16 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1726,184 +1726,184 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "73:" // Height 6: input setup done "cmp x27, #0x8\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z1.h }, p0/Z, [x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1rqh { z5.h }, p0/Z, [x22]\n" - "ld1rqh { z6.h }, p0/Z, [x21]\n" - "trn2 
z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqh { z7.h }, p0/Z, [x26]\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" + "trn1 z6.d, z7.d, z0.d\n" + "ld1rqh { z5.h }, p0/Z, [x24]\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" + "trn2 z7.d, z7.d, z0.d\n" + "trn1 z4.d, z5.d, z1.d\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "ld1rqh { z0.h }, p0/Z, [x21]\n" + "trn2 z5.d, z5.d, z1.d\n" + "trn1 z2.d, z3.d, z0.d\n" + "trn2 z3.d, z3.d, z0.d\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" + ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x8\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" "add x25, x25, #0x10\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" "add x21, x21, #0x10\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" - "ld1h { 
z6.h }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" + ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" + ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n" + ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n" + ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n" + ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n" + ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" + ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n" + ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n" + ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n" + ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z1.h }, p0/Z, 
[x26]\n" - "ld1rqh { z2.h }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" + "trn1 z7.d, z1.d, z0.d\n" "ld1rqh { z3.h }, p0/Z, [x24]\n" - "ld1rqh { z4.h }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" + "ld1rqh { z2.h }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z0.d\n" + "trn1 z6.d, z3.d, z2.d\n" "ld1rqh { z5.h }, p0/Z, [x22]\n" - "ld1rqh { z6.h }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n" - ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n" - ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqh { z0.h }, p0/Z, [x21]\n" + "trn2 z3.d, z3.d, z2.d\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e4e8 // bfmmla z8.s, z7.h, z2.h\n" + ".inst 0x6462e4d0 // bfmmla z16.s, z6.h, z2.h\n" + ".inst 0x6462e498 // bfmmla z24.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" - ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n" - ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n" - ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n" - ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" + ".inst 0x6460e49c // bfmmla z28.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6462e4e9 // bfmmla z9.s, z7.h, z2.h\n" + ".inst 0x6462e4d1 // bfmmla z17.s, z6.h, z2.h\n" + ".inst 0x6462e499 // bfmmla z25.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6460e4d5 // bfmmla z21.s, z6.h, z0.h\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6462e4ea // bfmmla z10.s, z7.h, z2.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + ".inst 0x6460e4d6 // bfmmla z22.s, z6.h, z0.h\n" + ".inst 0x6460e49e // bfmmla z30.s, z4.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6462e4eb // bfmmla z11.s, z7.h, z2.h\n" "addvl x10, x10, #8\n" - ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n" - ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n" - ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n" + 
".inst 0x6462e4d3 // bfmmla z19.s, z6.h, z2.h\n" + ".inst 0x6462e49b // bfmmla z27.s, z4.h, z2.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4d7 // bfmmla z23.s, z6.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "ble 76f\n" - "ld1h { z7.h }, p5/Z, [x10]\n" - "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n" - ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n" - ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n" - ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n" - ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n" - ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n" - ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" - ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n" - ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n" - ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n" - ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n" - ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z2.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6462e428 // bfmmla z8.s, z1.h, z2.h\n" + ".inst 0x6462e470 // bfmmla z16.s, z3.h, z2.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + ".inst 0x6460e42c // bfmmla z12.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e474 // bfmmla z20.s, z3.h, z0.h\n" + ".inst 0x6460e4bc // bfmmla z28.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6462e429 // bfmmla z9.s, z1.h, z2.h\n" + ".inst 0x6462e471 // bfmmla z17.s, z3.h, z2.h\n" + ".inst 0x6462e4b9 // bfmmla z25.s, z5.h, z2.h\n" + ".inst 0x6460e42d // bfmmla z13.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e475 // bfmmla z21.s, z3.h, z0.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6462e42a // bfmmla z10.s, z1.h, z2.h\n" + ".inst 0x6462e472 // bfmmla z18.s, z3.h, z2.h\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6460e42e // bfmmla z14.s, z1.h, z0.h\n" + "ld1h { z2.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e476 // bfmmla z22.s, z3.h, z0.h\n" + ".inst 0x6460e4be // bfmmla z30.s, z5.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n" - ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n" - ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n" - ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n" + ".inst 0x6462e42b // bfmmla z11.s, z1.h, z2.h\n" + ".inst 0x6462e473 // bfmmla z19.s, z3.h, z2.h\n" + ".inst 0x6462e4bb // bfmmla z27.s, z5.h, z2.h\n" + ".inst 0x6460e42f // bfmmla z15.s, z1.h, z0.h\n" + ".inst 0x6460e477 // bfmmla z23.s, z3.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ 
-2041,4 +2041,4 @@ void sve_hybrid_bf16fp32_mmla_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp index 6db9c0cdf3..b930e4c0d5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,16 +10,16 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE @@ -75,7 +75,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, __fp16>::value) { switch (ci->get_cpu_model()) { default: @@ -84,6 +83,8 @@ public: return { 12.44 }; case CPUModel::V1: return { 31.51 }; + case CPUModel::A64FX: + return { 49.14 }; } } @@ -107,5 +108,4 @@ public: } // namespace arm_gemm #undef ARGLIST - -#endif // __aarch64__ +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp index a70e66cbe4..d1a9bb4a26 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp @@ -139,11 +139,11 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -159,12 +159,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "9:" // Height 1: Multiply loop: Main loop "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n" "add x26, x26, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1h { z6.h }, p4/Z, [x10]\n" @@ -174,27 +174,27 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" "addvl x10, x10, #4\n" "bne 6b\n" "tbz %x[flags], #1, 11f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z17.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" + "ld1rh { z16.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z17.h\n" + "fmin z9.h, p4/M, z9.h, z17.h\n" + "fmin z10.h, p4/M, z10.h, z17.h\n" + "fmin z11.h, p4/M, z11.h, z17.h\n" + "fmax z8.h, p4/M, z8.h, z16.h\n" + "fmax z9.h, p4/M, z9.h, z16.h\n" + "fmax z10.h, p4/M, z10.h, z16.h\n" + "fmax z11.h, p4/M, z11.h, z16.h\n" "11:" // Height 1: No activation "st1h { z8.h }, p3, [x9]\n" "st1h { z9.h }, p2, [x9, #1, MUL VL]\n" @@ -234,15 +234,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "15:" // Height 2: no bias "tbz %x[flags], #0, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" + "add x20, x9, x20, LSL #1\n" "ld1h { z8.h }, p3/Z, [x9]\n" "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x20]\n" + "ld1h { z13.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x20, #3, MUL VL]\n" "b 17f\n" "16:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -258,12 +258,12 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -271,7 +271,7 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "b 20f\n" "19:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "20:" // Height 2: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ 
-282,18 +282,18 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "21:" // Height 2: Multiply loop: Main loop "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z12.h, p4/M, z6.h, z1.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n" "add x26, x26, #0x2\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "subs x27, x27, #0x1\n" "add x25, x25, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z14.h, p4/M, z17.h, z1.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" + "fmla z15.h, p4/M, z16.h, z1.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1rh { z1.h }, p4/Z, [x25]\n" "ld1h { z6.h }, p4/Z, [x10]\n" @@ -303,41 +303,41 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.h, p4/M, z6.h, z0.h\n" "fmla z12.h, p4/M, z6.h, z1.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z17.h }, p4/Z, [x10, #2, MUL VL]\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" + "fmla z10.h, p4/M, z17.h, z0.h\n" + "fmla z14.h, p4/M, z17.h, z1.h\n" "addvl x10, x10, #4\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z11.h, p4/M, z16.h, z0.h\n" + "fmla z15.h, p4/M, z16.h, z1.h\n" "bne 18b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #1\n" "tbz %x[flags], #1, 23f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z17.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" + "ld1rh { z16.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z17.h\n" + "fmin z9.h, p4/M, z9.h, z17.h\n" + "fmin z10.h, p4/M, z10.h, z17.h\n" + "fmin z11.h, p4/M, z11.h, z17.h\n" + "fmin z12.h, p4/M, z12.h, z17.h\n" + "fmin z13.h, p4/M, z13.h, z17.h\n" + "fmin z14.h, p4/M, z14.h, z17.h\n" + "fmin z15.h, p4/M, z15.h, z17.h\n" + "fmax z8.h, p4/M, z8.h, z16.h\n" + "fmax z9.h, p4/M, z9.h, z16.h\n" + "fmax z10.h, p4/M, z10.h, z16.h\n" + "fmax z11.h, p4/M, z11.h, z16.h\n" + "fmax z12.h, p4/M, z12.h, z16.h\n" + "fmax z13.h, p4/M, z13.h, z16.h\n" + "fmax z14.h, p4/M, z14.h, z16.h\n" + "fmax z15.h, p4/M, z15.h, z16.h\n" "23:" // Height 2: No activation "st1h { z8.h }, p3, [x9]\n" "st1h { z9.h }, p2, [x9, #1, MUL VL]\n" @@ -385,20 +385,20 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "27:" // Height 3: no bias "tbz %x[flags], #0, 28f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x21, x9, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" 
"ld1h { z8.h }, p3/Z, [x9]\n" "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x21]\n" + "ld1h { z13.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x20]\n" + "ld1h { z17.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x20, #3, MUL VL]\n" "b 29f\n" "28:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -418,13 +418,13 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -433,8 +433,8 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "b 32f\n" "31:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "32:" // Height 3: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -450,21 +450,21 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x2\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x24, x24, #0x2\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z21.h, z0.h\n" + "fmla z14.h, p4/M, z21.h, z1.h\n" + "fmla z18.h, p4/M, z21.h, z2.h\n" + "fmla z11.h, p4/M, z20.h, z0.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1h { z6.h }, p4/Z, [x10]\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z15.h, p4/M, z20.h, z1.h\n" + "fmla z19.h, p4/M, z20.h, z2.h\n" "ld1rh { z1.h }, p4/Z, [x25]\n" "ld1rh { z2.h }, p4/Z, [x24]\n" "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n" @@ -476,51 +476,51 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla 
z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z10.h, p4/M, z21.h, z0.h\n" + "fmla z14.h, p4/M, z21.h, z1.h\n" + "fmla z18.h, p4/M, z21.h, z2.h\n" + "fmla z11.h, p4/M, z20.h, z0.h\n" + "fmla z15.h, p4/M, z20.h, z1.h\n" + "fmla z19.h, p4/M, z20.h, z2.h\n" "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #1\n" "add x24, x25, x20, LSL #1\n" "tbz %x[flags], #1, 35f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z21.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" + "ld1rh { z20.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z21.h\n" + "fmin z9.h, p4/M, z9.h, z21.h\n" + "fmin z10.h, p4/M, z10.h, z21.h\n" + "fmin z11.h, p4/M, z11.h, z21.h\n" + "fmin z12.h, p4/M, z12.h, z21.h\n" + "fmin z13.h, p4/M, z13.h, z21.h\n" + "fmin z14.h, p4/M, z14.h, z21.h\n" + "fmin z15.h, p4/M, z15.h, z21.h\n" + "fmin z16.h, p4/M, z16.h, z21.h\n" + "fmin z17.h, p4/M, z17.h, z21.h\n" + "fmin z18.h, p4/M, z18.h, z21.h\n" + "fmin z19.h, p4/M, z19.h, z21.h\n" + "fmax z8.h, p4/M, z8.h, z20.h\n" + "fmax z9.h, p4/M, z9.h, z20.h\n" + "fmax z10.h, p4/M, z10.h, z20.h\n" + "fmax z11.h, p4/M, z11.h, z20.h\n" + "fmax z12.h, p4/M, z12.h, z20.h\n" + "fmax z13.h, p4/M, z13.h, z20.h\n" + "fmax z14.h, p4/M, z14.h, z20.h\n" + "fmax z15.h, p4/M, z15.h, z20.h\n" + "fmax z16.h, p4/M, z16.h, z20.h\n" + "fmax z17.h, p4/M, z17.h, z20.h\n" + "fmax z18.h, p4/M, z18.h, z20.h\n" + "fmax z19.h, p4/M, z19.h, z20.h\n" "35:" // Height 3: No activation "st1h { z8.h }, p3, [x9]\n" "st1h { z9.h }, p2, [x9, #1, MUL VL]\n" @@ -576,25 +576,25 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "39:" // Height 4: no bias "tbz %x[flags], #0, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x22, x9, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z8.h }, p3/Z, [x9]\n" - "add x23, x24, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, 
p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x22]\n" + "ld1h { z13.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x21]\n" + "ld1h { z17.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x20]\n" + "ld1h { z21.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x20, #3, MUL VL]\n" "b 41f\n" "40:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -618,14 +618,14 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "42:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 43f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 44f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -635,9 +635,9 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "b 44f\n" "43:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "44:" // Height 4: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -654,7 +654,7 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z20.h, p4/M, z6.h, z3.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x2\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, z1.h\n" @@ -662,19 +662,19 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x23, x23, #0x2\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "fmla z21.h, p4/M, z7.h, z3.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" + "fmla z10.h, p4/M, z25.h, z0.h\n" + "fmla z14.h, p4/M, z25.h, z1.h\n" + "fmla z18.h, p4/M, z25.h, z2.h\n" + "fmla z22.h, p4/M, z25.h, z3.h\n" "ld1h { z6.h }, p4/Z, [x10]\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" + "fmla z11.h, p4/M, z24.h, z0.h\n" + "fmla z15.h, p4/M, z24.h, z1.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1rh { z1.h }, p4/Z, [x25]\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z19.h, p4/M, z24.h, z2.h\n" + "fmla z23.h, p4/M, z24.h, z3.h\n" "ld1rh { z2.h }, p4/Z, [x24]\n" "ld1rh { z3.h }, p4/Z, [x23]\n" "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n" @@ -686,22 +686,22 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.h, p4/M, z6.h, z2.h\n" "fmla z20.h, p4/M, z6.h, z3.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "fmla z9.h, p4/M, z7.h, z0.h\n" "fmla z13.h, p4/M, z7.h, 
z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "fmla z21.h, p4/M, z7.h, z3.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" + "fmla z10.h, p4/M, z25.h, z0.h\n" + "fmla z14.h, p4/M, z25.h, z1.h\n" + "fmla z18.h, p4/M, z25.h, z2.h\n" + "fmla z22.h, p4/M, z25.h, z3.h\n" + "fmla z11.h, p4/M, z24.h, z0.h\n" + "fmla z15.h, p4/M, z24.h, z1.h\n" + "fmla z19.h, p4/M, z24.h, z2.h\n" + "fmla z23.h, p4/M, z24.h, z3.h\n" "bne 42b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #1\n" @@ -709,41 +709,41 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x23, x24, x20, LSL #1\n" "tbz %x[flags], #1, 47f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z25.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmin z20.h, p4/M, z20.h, z1.h\n" - "fmin z21.h, p4/M, z21.h, z1.h\n" - "fmin z22.h, p4/M, z22.h, z1.h\n" - "fmin z23.h, p4/M, z23.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" - "fmax z20.h, p4/M, z20.h, z0.h\n" - "fmax z21.h, p4/M, z21.h, z0.h\n" - "fmax z22.h, p4/M, z22.h, z0.h\n" - "fmax z23.h, p4/M, z23.h, z0.h\n" + "ld1rh { z24.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z25.h\n" + "fmin z9.h, p4/M, z9.h, z25.h\n" + "fmin z10.h, p4/M, z10.h, z25.h\n" + "fmin z11.h, p4/M, z11.h, z25.h\n" + "fmin z12.h, p4/M, z12.h, z25.h\n" + "fmin z13.h, p4/M, z13.h, z25.h\n" + "fmin z14.h, p4/M, z14.h, z25.h\n" + "fmin z15.h, p4/M, z15.h, z25.h\n" + "fmin z16.h, p4/M, z16.h, z25.h\n" + "fmin z17.h, p4/M, z17.h, z25.h\n" + "fmin z18.h, p4/M, z18.h, z25.h\n" + "fmin z19.h, p4/M, z19.h, z25.h\n" + "fmin z20.h, p4/M, z20.h, z25.h\n" + "fmin z21.h, p4/M, z21.h, z25.h\n" + "fmin z22.h, p4/M, z22.h, z25.h\n" + "fmin z23.h, p4/M, z23.h, z25.h\n" + "fmax z8.h, p4/M, z8.h, z24.h\n" + "fmax z9.h, p4/M, z9.h, z24.h\n" + "fmax z10.h, p4/M, z10.h, z24.h\n" + "fmax z11.h, p4/M, z11.h, z24.h\n" + "fmax z12.h, p4/M, z12.h, z24.h\n" + "fmax z13.h, p4/M, z13.h, z24.h\n" + "fmax z14.h, p4/M, z14.h, z24.h\n" + "fmax z15.h, p4/M, z15.h, z24.h\n" + "fmax z16.h, p4/M, z16.h, z24.h\n" + "fmax z17.h, p4/M, z17.h, z24.h\n" + "fmax z18.h, p4/M, z18.h, z24.h\n" + "fmax z19.h, p4/M, z19.h, z24.h\n" + "fmax z20.h, p4/M, z20.h, z24.h\n" + "fmax z21.h, p4/M, z21.h, z24.h\n" + "fmax z22.h, p4/M, z22.h, z24.h\n" + "fmax z23.h, p4/M, z23.h, z24.h\n" "47:" // Height 4: No activation "st1h { 
z8.h }, p3, [x9]\n" "st1h { z9.h }, p2, [x9, #1, MUL VL]\n" @@ -807,30 +807,30 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "51:" // Height 5: no bias "tbz %x[flags], #0, 52f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p3/Z, [x9]\n" - "add x23, x24, x20, LSL #1\n" + "add x23, x9, x20, LSL #1\n" "add x22, x23, x20, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x9]\n" + "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p3/Z, [x22]\n" - "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x23]\n" + "ld1h { z13.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x22]\n" + "ld1h { z17.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x21]\n" + "ld1h { z21.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x20]\n" + "ld1h { z25.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x20, #3, MUL VL]\n" "b 53f\n" "52:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -858,15 +858,15 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "54:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 55f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -877,10 +877,10 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "b 56f\n" "55:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "56:" // Height 5: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -902,29 +902,29 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x24, x24, #0x2\n" "fmla z24.h, p4/M, z6.h, z4.h\n" "fmla z9.h, 
p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n" "add x23, x23, #0x2\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "add x22, x22, #0x2\n" "fmla z21.h, p4/M, z7.h, z3.h\n" "fmla z25.h, p4/M, z7.h, z4.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z26.h, p4/M, z6.h, z4.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" + "fmla z10.h, p4/M, z29.h, z0.h\n" + "fmla z14.h, p4/M, z29.h, z1.h\n" + "fmla z18.h, p4/M, z29.h, z2.h\n" + "fmla z22.h, p4/M, z29.h, z3.h\n" + "fmla z26.h, p4/M, z29.h, z4.h\n" + "fmla z11.h, p4/M, z28.h, z0.h\n" "ld1rh { z0.h }, p4/Z, [x26]\n" "ld1h { z6.h }, p4/Z, [x10]\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" + "fmla z15.h, p4/M, z28.h, z1.h\n" + "fmla z19.h, p4/M, z28.h, z2.h\n" "ld1rh { z1.h }, p4/Z, [x25]\n" "ld1rh { z2.h }, p4/Z, [x24]\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" - "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z23.h, p4/M, z28.h, z3.h\n" + "fmla z27.h, p4/M, z28.h, z4.h\n" "ld1rh { z3.h }, p4/Z, [x23]\n" "ld1rh { z4.h }, p4/Z, [x22]\n" "ld1h { z7.h }, p4/Z, [x10, #1, MUL VL]\n" @@ -939,23 +939,23 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "cmp x28, x20\n" "fmla z24.h, p4/M, z6.h, z4.h\n" "fmla z9.h, p4/M, z7.h, z0.h\n" - "ld1h { z6.h }, p4/Z, [x10, #2, MUL VL]\n" + "ld1h { z29.h }, p4/Z, [x10, #2, MUL VL]\n" "fmla z13.h, p4/M, z7.h, z1.h\n" "fmla z17.h, p4/M, z7.h, z2.h\n" "fmla z21.h, p4/M, z7.h, z3.h\n" "fmla z25.h, p4/M, z7.h, z4.h\n" - "ld1h { z7.h }, p4/Z, [x10, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, p4/M, z6.h, z0.h\n" - "fmla z14.h, p4/M, z6.h, z1.h\n" - "fmla z18.h, p4/M, z6.h, z2.h\n" - "fmla z22.h, p4/M, z6.h, z3.h\n" - "fmla z26.h, p4/M, z6.h, z4.h\n" - "fmla z11.h, p4/M, z7.h, z0.h\n" - "fmla z15.h, p4/M, z7.h, z1.h\n" - "fmla z19.h, p4/M, z7.h, z2.h\n" - "fmla z23.h, p4/M, z7.h, z3.h\n" - "fmla z27.h, p4/M, z7.h, z4.h\n" + "fmla z10.h, p4/M, z29.h, z0.h\n" + "fmla z14.h, p4/M, z29.h, z1.h\n" + "fmla z18.h, p4/M, z29.h, z2.h\n" + "fmla z22.h, p4/M, z29.h, z3.h\n" + "fmla z26.h, p4/M, z29.h, z4.h\n" + "fmla z11.h, p4/M, z28.h, z0.h\n" + "fmla z15.h, p4/M, z28.h, z1.h\n" + "fmla z19.h, p4/M, z28.h, z2.h\n" + "fmla z23.h, p4/M, z28.h, z3.h\n" + "fmla z27.h, p4/M, z28.h, z4.h\n" "bne 54b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #1\n" @@ -964,49 +964,49 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "add x22, x23, x20, LSL #1\n" "tbz %x[flags], #1, 59f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p4/Z, [x20]\n" + "ld1rh { z29.h }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p4/Z, [x20]\n" - "fmin z8.h, p4/M, z8.h, z1.h\n" - "fmin z9.h, p4/M, z9.h, z1.h\n" - "fmin z10.h, p4/M, z10.h, z1.h\n" - "fmin z11.h, p4/M, z11.h, z1.h\n" - "fmin z12.h, p4/M, z12.h, z1.h\n" - "fmin z13.h, p4/M, z13.h, z1.h\n" - "fmin z14.h, p4/M, z14.h, z1.h\n" - "fmin z15.h, p4/M, z15.h, z1.h\n" - "fmin z16.h, p4/M, z16.h, z1.h\n" - "fmin z17.h, p4/M, z17.h, z1.h\n" - "fmin z18.h, p4/M, z18.h, z1.h\n" - "fmin z19.h, p4/M, z19.h, z1.h\n" - "fmin z20.h, p4/M, z20.h, z1.h\n" - "fmin z21.h, p4/M, z21.h, z1.h\n" - "fmin z22.h, p4/M, z22.h, z1.h\n" - "fmin z23.h, p4/M, z23.h, z1.h\n" - "fmin z24.h, p4/M, z24.h, z1.h\n" - "fmin 
z25.h, p4/M, z25.h, z1.h\n" - "fmin z26.h, p4/M, z26.h, z1.h\n" - "fmin z27.h, p4/M, z27.h, z1.h\n" - "fmax z8.h, p4/M, z8.h, z0.h\n" - "fmax z9.h, p4/M, z9.h, z0.h\n" - "fmax z10.h, p4/M, z10.h, z0.h\n" - "fmax z11.h, p4/M, z11.h, z0.h\n" - "fmax z12.h, p4/M, z12.h, z0.h\n" - "fmax z13.h, p4/M, z13.h, z0.h\n" - "fmax z14.h, p4/M, z14.h, z0.h\n" - "fmax z15.h, p4/M, z15.h, z0.h\n" - "fmax z16.h, p4/M, z16.h, z0.h\n" - "fmax z17.h, p4/M, z17.h, z0.h\n" - "fmax z18.h, p4/M, z18.h, z0.h\n" - "fmax z19.h, p4/M, z19.h, z0.h\n" - "fmax z20.h, p4/M, z20.h, z0.h\n" - "fmax z21.h, p4/M, z21.h, z0.h\n" - "fmax z22.h, p4/M, z22.h, z0.h\n" - "fmax z23.h, p4/M, z23.h, z0.h\n" - "fmax z24.h, p4/M, z24.h, z0.h\n" - "fmax z25.h, p4/M, z25.h, z0.h\n" - "fmax z26.h, p4/M, z26.h, z0.h\n" - "fmax z27.h, p4/M, z27.h, z0.h\n" + "ld1rh { z28.h }, p4/Z, [x20]\n" + "fmin z8.h, p4/M, z8.h, z29.h\n" + "fmin z9.h, p4/M, z9.h, z29.h\n" + "fmin z10.h, p4/M, z10.h, z29.h\n" + "fmin z11.h, p4/M, z11.h, z29.h\n" + "fmin z12.h, p4/M, z12.h, z29.h\n" + "fmin z13.h, p4/M, z13.h, z29.h\n" + "fmin z14.h, p4/M, z14.h, z29.h\n" + "fmin z15.h, p4/M, z15.h, z29.h\n" + "fmin z16.h, p4/M, z16.h, z29.h\n" + "fmin z17.h, p4/M, z17.h, z29.h\n" + "fmin z18.h, p4/M, z18.h, z29.h\n" + "fmin z19.h, p4/M, z19.h, z29.h\n" + "fmin z20.h, p4/M, z20.h, z29.h\n" + "fmin z21.h, p4/M, z21.h, z29.h\n" + "fmin z22.h, p4/M, z22.h, z29.h\n" + "fmin z23.h, p4/M, z23.h, z29.h\n" + "fmin z24.h, p4/M, z24.h, z29.h\n" + "fmin z25.h, p4/M, z25.h, z29.h\n" + "fmin z26.h, p4/M, z26.h, z29.h\n" + "fmin z27.h, p4/M, z27.h, z29.h\n" + "fmax z8.h, p4/M, z8.h, z28.h\n" + "fmax z9.h, p4/M, z9.h, z28.h\n" + "fmax z10.h, p4/M, z10.h, z28.h\n" + "fmax z11.h, p4/M, z11.h, z28.h\n" + "fmax z12.h, p4/M, z12.h, z28.h\n" + "fmax z13.h, p4/M, z13.h, z28.h\n" + "fmax z14.h, p4/M, z14.h, z28.h\n" + "fmax z15.h, p4/M, z15.h, z28.h\n" + "fmax z16.h, p4/M, z16.h, z28.h\n" + "fmax z17.h, p4/M, z17.h, z28.h\n" + "fmax z18.h, p4/M, z18.h, z28.h\n" + "fmax z19.h, p4/M, z19.h, z28.h\n" + "fmax z20.h, p4/M, z20.h, z28.h\n" + "fmax z21.h, p4/M, z21.h, z28.h\n" + "fmax z22.h, p4/M, z22.h, z28.h\n" + "fmax z23.h, p4/M, z23.h, z28.h\n" + "fmax z24.h, p4/M, z24.h, z28.h\n" + "fmax z25.h, p4/M, z25.h, z28.h\n" + "fmax z26.h, p4/M, z26.h, z28.h\n" + "fmax z27.h, p4/M, z27.h, z28.h\n" "59:" // Height 5: No activation "st1h { z8.h }, p3, [x9]\n" "st1h { z9.h }, p2, [x9, #1, MUL VL]\n" @@ -1081,35 +1081,35 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "63:" // Height 6: no bias "tbz %x[flags], #0, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p3/Z, [x9]\n" + "add x24, x9, x20, LSL #1\n" "add x23, x24, x20, LSL #1\n" + "ld1h { z8.h }, p3/Z, [x9]\n" "add x22, x23, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z9.h }, p2/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p1/Z, [x9, #2, MUL VL]\n" - "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z11.h }, p0/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p3/Z, [x25]\n" - "ld1h { z13.h }, p2/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p1/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p0/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p3/Z, [x24]\n" - "ld1h { z17.h }, p2/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p0/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p3/Z, [x23]\n" - "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p1/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p0/Z, [x23, #3, MUL VL]\n" 
- "ld1h { z24.h }, p3/Z, [x22]\n" - "ld1h { z25.h }, p2/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p1/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p0/Z, [x22, #3, MUL VL]\n" - "ld1h { z28.h }, p3/Z, [x21]\n" - "ld1h { z29.h }, p2/Z, [x21, #1, MUL VL]\n" - "ld1h { z30.h }, p1/Z, [x21, #2, MUL VL]\n" - "ld1h { z31.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z12.h }, p3/Z, [x24]\n" + "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p3/Z, [x23]\n" + "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p3/Z, [x22]\n" + "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p3/Z, [x21]\n" + "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n" + "ld1h { z28.h }, p3/Z, [x20]\n" + "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n" "b 65f\n" "64:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1141,16 +1141,16 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "66:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 67f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1162,11 +1162,11 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "b 68f\n" "67:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "68:" // Height 6: input setup done "subs x27, x27, #0x1\n" "ld1rh { z0.h }, p4/Z, [x26]\n" @@ -1355,7 +1355,6 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "74:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : 
"cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1363,4 +1362,4 @@ void sve_hybrid_fp16_mla_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp index 6f0b3e0008..041825df6b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp @@ -139,11 +139,11 @@ void sve_hybrid_fp16_mla_6x4VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -156,164 +156,164 @@ void sve_hybrid_fp16_mla_6x4VL ( "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10]\n" + "fmla z8.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z8.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z10.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, 
[x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z11.h, z16.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[2]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[2]\n" + "fmla z11.h, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[3]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[3]\n" + "fmla z11.h, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[4]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[4]\n" + "fmla z11.h, z16.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[5]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[5]\n" + "fmla z11.h, z16.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[6]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[6]\n" + "fmla z11.h, z16.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[7]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x8\n" "cmp x27, #0x8\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" "add x26, x26, #0x10\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10]\n" + "fmla z8.h, z16.h, z0.h[0]\n" + 
"ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[0]\n" + "fmla z11.h, z16.h, z0.h[0]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[1]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z10.h, z17.h, z0.h[1]\n" + "fmla z11.h, z16.h, z0.h[1]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[2]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z10.h, z17.h, z0.h[2]\n" + "fmla z11.h, z16.h, z0.h[2]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[3]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z10.h, z17.h, z0.h[3]\n" + "fmla z11.h, z16.h, z0.h[3]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[4]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z10.h, z17.h, z0.h[4]\n" + "fmla z11.h, z16.h, z0.h[4]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[5]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla 
z11.h, z7.h, z0.h[5]\n" + "fmla z10.h, z17.h, z0.h[5]\n" + "fmla z11.h, z16.h, z0.h[5]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[6]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z10.h, z17.h, z0.h[6]\n" + "fmla z11.h, z16.h, z0.h[6]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[7]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" "addvl x10, x10, #4\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -322,17 +322,17 @@ void sve_hybrid_fp16_mla_6x4VL ( "bne 6b\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z17.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" + "ld1rh { z16.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z17.h\n" + "fmin z9.h, p5/M, z9.h, z17.h\n" + "fmin z10.h, p5/M, z10.h, z17.h\n" + "fmin z11.h, p5/M, z11.h, z17.h\n" + "fmax z8.h, p5/M, z8.h, z16.h\n" + "fmax z9.h, p5/M, z9.h, z16.h\n" + "fmax z10.h, p5/M, z10.h, z16.h\n" + "fmax z11.h, p5/M, z11.h, z16.h\n" "12:" // Height 1: No activation "st1h { z8.h }, p4, [x9]\n" "st1h { z9.h }, p3, [x9, #1, MUL VL]\n" @@ -372,15 +372,15 @@ void sve_hybrid_fp16_mla_6x4VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" + "add x20, x9, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x9]\n" "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x20]\n" + "ld1h { z13.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x20, #3, MUL VL]\n" "b 18f\n" "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -396,12 +396,12 @@ void sve_hybrid_fp16_mla_6x4VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -409,239 +409,239 @@ void sve_hybrid_fp16_mla_6x4VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" "21:" // Height 2: input setup done "cmp x27, #0x8\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z1.h }, p0/Z, [x26]\n" + "ld1rqh { z0.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[0]\n" + "fmla z12.h, z17.h, z0.h[0]\n" + "fmla z9.h, z16.h, z1.h[0]\n" + "fmla z13.h, z16.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[0]\n" + "fmla z14.h, z17.h, z0.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" "cmp x27, #0x8\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[0]\n" + "fmla z15.h, z16.h, z0.h[0]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" "add x26, x26, #0x10\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[1]\n" + "fmla z12.h, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[1]\n" + "fmla z13.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "ld1h { z7.h }, p5/Z, 
[x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[1]\n" + "fmla z14.h, z17.h, z0.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[1]\n" + "fmla z15.h, z16.h, z0.h[1]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[2]\n" + "fmla z12.h, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[2]\n" + "fmla z13.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[2]\n" + "fmla z14.h, z17.h, z0.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[2]\n" + "fmla z15.h, z16.h, z0.h[2]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[3]\n" + "fmla z12.h, z17.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[3]\n" + "fmla z13.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[3]\n" + "fmla z14.h, z17.h, z0.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "fmla z11.h, z16.h, z1.h[3]\n" + "fmla z15.h, z16.h, z0.h[3]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[4]\n" + "fmla z12.h, z17.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[4]\n" + "fmla z13.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[4]\n" + "fmla z14.h, z17.h, z0.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[4]\n" + "fmla z15.h, z16.h, z0.h[4]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[5]\n" + "fmla z12.h, z17.h, z0.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[5]\n" + "fmla z13.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z10.h, z17.h, z1.h[5]\n" + "fmla z14.h, z17.h, z0.h[5]\n" + "ld1h { 
z17.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[5]\n" + "fmla z15.h, z16.h, z0.h[5]\n" + "ld1h { z16.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[6]\n" + "fmla z12.h, z17.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[6]\n" + "fmla z13.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[6]\n" + "fmla z14.h, z17.h, z0.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z16.h, z1.h[6]\n" + "fmla z15.h, z16.h, z0.h[6]\n" + "ld1h { z16.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z17.h, z1.h[7]\n" + "fmla z12.h, z17.h, z0.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z16.h, z1.h[7]\n" + "fmla z13.h, z16.h, z0.h[7]\n" + "ld1h { z16.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z17.h, z1.h[7]\n" + "fmla z14.h, z17.h, z0.h[7]\n" + "fmla z11.h, z16.h, z1.h[7]\n" + "fmla z15.h, z16.h, z0.h[7]\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" "ld1rqh { z0.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[0]\n" + "fmla z12.h, z17.h, z1.h[0]\n" + "fmla z9.h, z16.h, z0.h[0]\n" + "fmla z13.h, z16.h, z1.h[0]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[0]\n" + "fmla z14.h, z17.h, z1.h[0]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z11.h, z16.h, z0.h[0]\n" + "fmla z15.h, z16.h, z1.h[0]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[1]\n" + "fmla z12.h, z17.h, z1.h[1]\n" + "fmla z9.h, z16.h, z0.h[1]\n" + "fmla z13.h, z16.h, z1.h[1]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z10.h, z17.h, z0.h[1]\n" + "fmla z14.h, z17.h, z1.h[1]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z11.h, z16.h, z0.h[1]\n" + "fmla z15.h, z16.h, z1.h[1]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[2]\n" + "fmla z12.h, z17.h, z1.h[2]\n" + "fmla z9.h, z16.h, z0.h[2]\n" + "fmla z13.h, z16.h, z1.h[2]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, 
x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z10.h, z17.h, z0.h[2]\n" + "fmla z14.h, z17.h, z1.h[2]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z11.h, z16.h, z0.h[2]\n" + "fmla z15.h, z16.h, z1.h[2]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[3]\n" + "fmla z12.h, z17.h, z1.h[3]\n" + "fmla z9.h, z16.h, z0.h[3]\n" + "fmla z13.h, z16.h, z1.h[3]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z10.h, z17.h, z0.h[3]\n" + "fmla z14.h, z17.h, z1.h[3]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z11.h, z16.h, z0.h[3]\n" + "fmla z15.h, z16.h, z1.h[3]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[4]\n" + "fmla z12.h, z17.h, z1.h[4]\n" + "fmla z9.h, z16.h, z0.h[4]\n" + "fmla z13.h, z16.h, z1.h[4]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z10.h, z17.h, z0.h[4]\n" + "fmla z14.h, z17.h, z1.h[4]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z11.h, z16.h, z0.h[4]\n" + "fmla z15.h, z16.h, z1.h[4]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[5]\n" + "fmla z12.h, z17.h, z1.h[5]\n" + "fmla z9.h, z16.h, z0.h[5]\n" + "fmla z13.h, z16.h, z1.h[5]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z10.h, z17.h, z0.h[5]\n" + "fmla z14.h, z17.h, z1.h[5]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z11.h, z16.h, z0.h[5]\n" + "fmla z15.h, z16.h, z1.h[5]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[6]\n" + "fmla z12.h, z17.h, z1.h[6]\n" + "fmla z9.h, z16.h, z0.h[6]\n" + "fmla z13.h, z16.h, 
z1.h[6]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z10.h, z17.h, z0.h[6]\n" + "fmla z14.h, z17.h, z1.h[6]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z11.h, z16.h, z0.h[6]\n" + "fmla z15.h, z16.h, z1.h[6]\n" "ble 24f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z17.h, z0.h[7]\n" + "fmla z12.h, z17.h, z1.h[7]\n" + "fmla z9.h, z16.h, z0.h[7]\n" + "fmla z13.h, z16.h, z1.h[7]\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z17.h, z0.h[7]\n" + "fmla z14.h, z17.h, z1.h[7]\n" "addvl x10, x10, #4\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z11.h, z16.h, z0.h[7]\n" + "fmla z15.h, z16.h, z1.h[7]\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -651,25 +651,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "add x25, x9, x20, LSL #1\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z17.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" + "ld1rh { z16.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z17.h\n" + "fmin z9.h, p5/M, z9.h, z17.h\n" + "fmin z10.h, p5/M, z10.h, z17.h\n" + "fmin z11.h, p5/M, z11.h, z17.h\n" + "fmin z12.h, p5/M, z12.h, z17.h\n" + "fmin z13.h, p5/M, z13.h, z17.h\n" + "fmin z14.h, p5/M, z14.h, z17.h\n" + "fmin z15.h, p5/M, z15.h, z17.h\n" + "fmax z8.h, p5/M, z8.h, z16.h\n" + "fmax z9.h, p5/M, z9.h, z16.h\n" + "fmax z10.h, p5/M, z10.h, z16.h\n" + "fmax z11.h, p5/M, z11.h, z16.h\n" + "fmax z12.h, p5/M, z12.h, z16.h\n" + "fmax z13.h, p5/M, z13.h, z16.h\n" + "fmax z14.h, p5/M, z14.h, z16.h\n" + "fmax z15.h, p5/M, z15.h, z16.h\n" "25:" // Height 2: No activation "st1h { z8.h }, p4, [x9]\n" "st1h { z9.h }, p3, [x9, #1, MUL VL]\n" @@ -717,20 +717,20 @@ void sve_hybrid_fp16_mla_6x4VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x21, x9, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x9]\n" "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h 
}, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x21]\n" + "ld1h { z13.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x20]\n" + "ld1h { z17.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x20, #3, MUL VL]\n" "b 31f\n" "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -750,13 +750,13 @@ void sve_hybrid_fp16_mla_6x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -765,151 +765,151 @@ void sve_hybrid_fp16_mla_6x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" "34:" // Height 3: input setup done "cmp x27, #0x8\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" "ld1rqh { z1.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1rqh { z0.h }, p0/Z, [x24]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "fmla z8.h, z21.h, z2.h[0]\n" + "fmla z12.h, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.h, z21.h, z0.h[0]\n" + "fmla z9.h, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[0]\n" + "fmla z17.h, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "cmp x27, #0x8\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z10.h, z21.h, z2.h[0]\n" + "fmla z14.h, z21.h, z1.h[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z18.h, z21.h, z0.h[0]\n" + "fmla z11.h, z20.h, z2.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + 
"fmla z15.h, z20.h, z1.h[0]\n" + "fmla z19.h, z20.h, z0.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[1]\n" + "fmla z12.h, z21.h, z1.h[1]\n" + "fmla z16.h, z21.h, z0.h[1]\n" + "fmla z9.h, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[1]\n" + "fmla z17.h, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[1]\n" + "fmla z14.h, z21.h, z1.h[1]\n" + "fmla z18.h, z21.h, z0.h[1]\n" + "fmla z11.h, z20.h, z2.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[1]\n" + "fmla z19.h, z20.h, z0.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[2]\n" + "fmla z12.h, z21.h, z1.h[2]\n" + "fmla z16.h, z21.h, z0.h[2]\n" + "fmla z9.h, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[2]\n" + "fmla z17.h, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[2]\n" + "fmla z14.h, z21.h, z1.h[2]\n" + "fmla z18.h, z21.h, z0.h[2]\n" + "fmla z11.h, z20.h, z2.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[2]\n" + "fmla z19.h, z20.h, z0.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, 
z21.h, z2.h[3]\n" + "fmla z12.h, z21.h, z1.h[3]\n" + "fmla z16.h, z21.h, z0.h[3]\n" + "fmla z9.h, z20.h, z2.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[3]\n" + "fmla z17.h, z20.h, z0.h[3]\n" + "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[3]\n" + "fmla z14.h, z21.h, z1.h[3]\n" + "fmla z18.h, z21.h, z0.h[3]\n" + "fmla z11.h, z20.h, z2.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "fmla z15.h, z20.h, z1.h[3]\n" + "fmla z19.h, z20.h, z0.h[3]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[4]\n" + "fmla z12.h, z21.h, z1.h[4]\n" + "fmla z16.h, z21.h, z0.h[4]\n" + "fmla z9.h, z20.h, z2.h[4]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[4]\n" + "fmla z17.h, z20.h, z0.h[4]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[4]\n" + "fmla z14.h, z21.h, z1.h[4]\n" + "fmla z18.h, z21.h, z0.h[4]\n" + "fmla z11.h, z20.h, z2.h[4]\n" + "ld1h { z21.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[4]\n" + "fmla z19.h, z20.h, z0.h[4]\n" + "ld1h { z20.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[5]\n" + "fmla z12.h, z21.h, z1.h[5]\n" + "fmla z16.h, z21.h, z0.h[5]\n" + "fmla z9.h, z20.h, z2.h[5]\n" + "ld1h { z21.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[5]\n" + "fmla z17.h, z20.h, z0.h[5]\n" + "ld1h { z20.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z10.h, z21.h, z2.h[5]\n" + "fmla z14.h, z21.h, z1.h[5]\n" + "fmla z18.h, z21.h, z0.h[5]\n" + "fmla z11.h, z20.h, z2.h[5]\n" + "ld1h { z21.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.h, z20.h, z1.h[5]\n" + "fmla z19.h, z20.h, z0.h[5]\n" + "ld1h { z20.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[6]\n" + "fmla z12.h, z21.h, z1.h[6]\n" + "fmla z16.h, z21.h, z0.h[6]\n" + "fmla z9.h, z20.h, z2.h[6]\n" + "ld1h { z21.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[6]\n" + "fmla z17.h, z20.h, z0.h[6]\n" + "ld1h { z20.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[6]\n" + "fmla z14.h, z21.h, z1.h[6]\n" + "fmla z18.h, z21.h, z0.h[6]\n" + "fmla z11.h, z20.h, z2.h[6]\n" + "ld1h { z21.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.h, z20.h, 
z1.h[6]\n" + "fmla z19.h, z20.h, z0.h[6]\n" + "ld1h { z20.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z21.h, z2.h[7]\n" + "fmla z12.h, z21.h, z1.h[7]\n" + "fmla z16.h, z21.h, z0.h[7]\n" + "fmla z9.h, z20.h, z2.h[7]\n" + "ld1h { z21.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[7]\n" + "fmla z17.h, z20.h, z0.h[7]\n" + "ld1h { z20.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z21.h, z2.h[7]\n" + "fmla z14.h, z21.h, z1.h[7]\n" + "fmla z18.h, z21.h, z0.h[7]\n" + "fmla z11.h, z20.h, z2.h[7]\n" + "fmla z15.h, z20.h, z1.h[7]\n" + "fmla z19.h, z20.h, z0.h[7]\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -917,155 +917,155 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z1.h }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "fmla z8.h, z21.h, z0.h[0]\n" + "fmla z12.h, z21.h, z1.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.h, z21.h, z2.h[0]\n" + "fmla z9.h, z20.h, z0.h[0]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[0]\n" + "fmla z17.h, z20.h, z2.h[0]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z10.h, z21.h, z0.h[0]\n" + "fmla z14.h, z21.h, z1.h[0]\n" + "fmla z18.h, z21.h, z2.h[0]\n" + "fmla z11.h, z20.h, z0.h[0]\n" + "fmla z15.h, z20.h, z1.h[0]\n" + "fmla z19.h, z20.h, z2.h[0]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[1]\n" + "fmla z12.h, z21.h, z1.h[1]\n" + "fmla z16.h, z21.h, z2.h[1]\n" + "fmla z9.h, z20.h, z0.h[1]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[1]\n" + "fmla z17.h, z20.h, z2.h[1]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z10.h, z21.h, z0.h[1]\n" + "fmla z14.h, z21.h, z1.h[1]\n" + "fmla z18.h, z21.h, z2.h[1]\n" + "fmla z11.h, z20.h, z0.h[1]\n" + "fmla z15.h, z20.h, z1.h[1]\n" + "fmla z19.h, z20.h, z2.h[1]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[2]\n" + "fmla z12.h, z21.h, z1.h[2]\n" + "fmla 
z16.h, z21.h, z2.h[2]\n" + "fmla z9.h, z20.h, z0.h[2]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[2]\n" + "fmla z17.h, z20.h, z2.h[2]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z10.h, z21.h, z0.h[2]\n" + "fmla z14.h, z21.h, z1.h[2]\n" + "fmla z18.h, z21.h, z2.h[2]\n" + "fmla z11.h, z20.h, z0.h[2]\n" + "fmla z15.h, z20.h, z1.h[2]\n" + "fmla z19.h, z20.h, z2.h[2]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[3]\n" + "fmla z12.h, z21.h, z1.h[3]\n" + "fmla z16.h, z21.h, z2.h[3]\n" + "fmla z9.h, z20.h, z0.h[3]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[3]\n" + "fmla z17.h, z20.h, z2.h[3]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z10.h, z21.h, z0.h[3]\n" + "fmla z14.h, z21.h, z1.h[3]\n" + "fmla z18.h, z21.h, z2.h[3]\n" + "fmla z11.h, z20.h, z0.h[3]\n" + "fmla z15.h, z20.h, z1.h[3]\n" + "fmla z19.h, z20.h, z2.h[3]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[4]\n" + "fmla z12.h, z21.h, z1.h[4]\n" + "fmla z16.h, z21.h, z2.h[4]\n" + "fmla z9.h, z20.h, z0.h[4]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[4]\n" + "fmla z17.h, z20.h, z2.h[4]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z10.h, z21.h, z0.h[4]\n" + "fmla z14.h, z21.h, z1.h[4]\n" + "fmla z18.h, z21.h, z2.h[4]\n" + "fmla z11.h, z20.h, z0.h[4]\n" + "fmla z15.h, z20.h, z1.h[4]\n" + "fmla z19.h, z20.h, z2.h[4]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[5]\n" + "fmla z12.h, z21.h, z1.h[5]\n" + "fmla z16.h, z21.h, z2.h[5]\n" + 
"fmla z9.h, z20.h, z0.h[5]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[5]\n" + "fmla z17.h, z20.h, z2.h[5]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z10.h, z21.h, z0.h[5]\n" + "fmla z14.h, z21.h, z1.h[5]\n" + "fmla z18.h, z21.h, z2.h[5]\n" + "fmla z11.h, z20.h, z0.h[5]\n" + "fmla z15.h, z20.h, z1.h[5]\n" + "fmla z19.h, z20.h, z2.h[5]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[6]\n" + "fmla z12.h, z21.h, z1.h[6]\n" + "fmla z16.h, z21.h, z2.h[6]\n" + "fmla z9.h, z20.h, z0.h[6]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[6]\n" + "fmla z17.h, z20.h, z2.h[6]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z10.h, z21.h, z0.h[6]\n" + "fmla z14.h, z21.h, z1.h[6]\n" + "fmla z18.h, z21.h, z2.h[6]\n" + "fmla z11.h, z20.h, z0.h[6]\n" + "fmla z15.h, z20.h, z1.h[6]\n" + "fmla z19.h, z20.h, z2.h[6]\n" "ble 37f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z21.h }, p5/Z, [x10]\n" + "ld1h { z20.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z21.h, z0.h[7]\n" + "fmla z12.h, z21.h, z1.h[7]\n" + "fmla z16.h, z21.h, z2.h[7]\n" + "fmla z9.h, z20.h, z0.h[7]\n" + "ld1h { z21.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z20.h, z1.h[7]\n" + "fmla z17.h, z20.h, z2.h[7]\n" + "ld1h { z20.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z10.h, z21.h, z0.h[7]\n" + "fmla z14.h, z21.h, z1.h[7]\n" + "fmla z18.h, z21.h, z2.h[7]\n" + "fmla z11.h, z20.h, z0.h[7]\n" + "fmla z15.h, z20.h, z1.h[7]\n" + "fmla z19.h, z20.h, z2.h[7]\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1076,33 +1076,33 @@ void sve_hybrid_fp16_mla_6x4VL ( "add x24, x25, x20, LSL #1\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z21.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - 
"fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" + "ld1rh { z20.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z21.h\n" + "fmin z9.h, p5/M, z9.h, z21.h\n" + "fmin z10.h, p5/M, z10.h, z21.h\n" + "fmin z11.h, p5/M, z11.h, z21.h\n" + "fmin z12.h, p5/M, z12.h, z21.h\n" + "fmin z13.h, p5/M, z13.h, z21.h\n" + "fmin z14.h, p5/M, z14.h, z21.h\n" + "fmin z15.h, p5/M, z15.h, z21.h\n" + "fmin z16.h, p5/M, z16.h, z21.h\n" + "fmin z17.h, p5/M, z17.h, z21.h\n" + "fmin z18.h, p5/M, z18.h, z21.h\n" + "fmin z19.h, p5/M, z19.h, z21.h\n" + "fmax z8.h, p5/M, z8.h, z20.h\n" + "fmax z9.h, p5/M, z9.h, z20.h\n" + "fmax z10.h, p5/M, z10.h, z20.h\n" + "fmax z11.h, p5/M, z11.h, z20.h\n" + "fmax z12.h, p5/M, z12.h, z20.h\n" + "fmax z13.h, p5/M, z13.h, z20.h\n" + "fmax z14.h, p5/M, z14.h, z20.h\n" + "fmax z15.h, p5/M, z15.h, z20.h\n" + "fmax z16.h, p5/M, z16.h, z20.h\n" + "fmax z17.h, p5/M, z17.h, z20.h\n" + "fmax z18.h, p5/M, z18.h, z20.h\n" + "fmax z19.h, p5/M, z19.h, z20.h\n" "38:" // Height 3: No activation "st1h { z8.h }, p4, [x9]\n" "st1h { z9.h }, p3, [x9, #1, MUL VL]\n" @@ -1158,25 +1158,25 @@ void sve_hybrid_fp16_mla_6x4VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" + "add x22, x9, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z8.h }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x22]\n" + "ld1h { z13.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x21]\n" + "ld1h { z17.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x20]\n" + "ld1h { z21.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x20, #3, MUL VL]\n" "b 44f\n" "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -1200,14 +1200,14 @@ void sve_hybrid_fp16_mla_6x4VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" 
"ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1217,186 +1217,186 @@ void sve_hybrid_fp16_mla_6x4VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" "47:" // Height 4: input setup done "cmp x27, #0x8\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z3.h }, p0/Z, [x26]\n" + "ld1rqh { z2.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x24]\n" + "ld1rqh { z0.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[0]\n" + "fmla z12.h, z25.h, z2.h[0]\n" + "fmla z16.h, z25.h, z1.h[0]\n" + "fmla z20.h, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z9.h, z24.h, z3.h[0]\n" + "fmla z13.h, z24.h, z2.h[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z17.h, z24.h, z1.h[0]\n" + "fmla z21.h, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[0]\n" + "fmla z14.h, z25.h, z2.h[0]\n" + "fmla z18.h, z25.h, z1.h[0]\n" + "fmla z22.h, z25.h, z0.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[0]\n" + "fmla z15.h, z24.h, z2.h[0]\n" + "fmla z19.h, z24.h, z1.h[0]\n" + "fmla z23.h, z24.h, z0.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[1]\n" + "fmla z12.h, z25.h, z2.h[1]\n" + "fmla z16.h, z25.h, z1.h[1]\n" + "fmla 
z20.h, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[1]\n" + "fmla z13.h, z24.h, z2.h[1]\n" + "fmla z17.h, z24.h, z1.h[1]\n" + "fmla z21.h, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[1]\n" + "fmla z14.h, z25.h, z2.h[1]\n" + "fmla z18.h, z25.h, z1.h[1]\n" + "fmla z22.h, z25.h, z0.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[1]\n" + "fmla z15.h, z24.h, z2.h[1]\n" + "fmla z19.h, z24.h, z1.h[1]\n" + "fmla z23.h, z24.h, z0.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[2]\n" + "fmla z12.h, z25.h, z2.h[2]\n" + "fmla z16.h, z25.h, z1.h[2]\n" + "fmla z20.h, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #-6, 
MUL VL]\n" + "fmla z9.h, z24.h, z3.h[2]\n" + "fmla z13.h, z24.h, z2.h[2]\n" + "fmla z17.h, z24.h, z1.h[2]\n" + "fmla z21.h, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[2]\n" + "fmla z14.h, z25.h, z2.h[2]\n" + "fmla z18.h, z25.h, z1.h[2]\n" + "fmla z22.h, z25.h, z0.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[2]\n" + "fmla z15.h, z24.h, z2.h[2]\n" + "fmla z19.h, z24.h, z1.h[2]\n" + "fmla z23.h, z24.h, z0.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[3]\n" + "fmla z12.h, z25.h, z2.h[3]\n" + "fmla z16.h, z25.h, z1.h[3]\n" + "fmla z20.h, z25.h, z0.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[3]\n" + "fmla z13.h, z24.h, z2.h[3]\n" + "fmla z17.h, z24.h, z1.h[3]\n" + "fmla z21.h, z24.h, z0.h[3]\n" + "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[3]\n" + "fmla z14.h, z25.h, z2.h[3]\n" + "fmla z18.h, z25.h, z1.h[3]\n" + "fmla z22.h, z25.h, z0.h[3]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "fmla z11.h, z24.h, z3.h[3]\n" + "fmla z15.h, z24.h, z2.h[3]\n" + "fmla z19.h, z24.h, z1.h[3]\n" + "fmla z23.h, z24.h, z0.h[3]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[4]\n" + "fmla z12.h, z25.h, z2.h[4]\n" + "fmla z16.h, z25.h, z1.h[4]\n" + "fmla z20.h, z25.h, z0.h[4]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[4]\n" + "fmla z13.h, z24.h, z2.h[4]\n" + "fmla z17.h, z24.h, z1.h[4]\n" + "fmla z21.h, z24.h, z0.h[4]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[4]\n" + "fmla z14.h, z25.h, z2.h[4]\n" + "fmla z18.h, z25.h, z1.h[4]\n" + "fmla z22.h, z25.h, z0.h[4]\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[4]\n" + "fmla z15.h, z24.h, z2.h[4]\n" + "fmla z19.h, z24.h, z1.h[4]\n" + "fmla z23.h, z24.h, z0.h[4]\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[5]\n" + "fmla z12.h, z25.h, z2.h[5]\n" + "fmla z16.h, z25.h, z1.h[5]\n" + "fmla z20.h, z25.h, z0.h[5]\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[5]\n" + "fmla z13.h, z24.h, z2.h[5]\n" + "fmla z17.h, z24.h, z1.h[5]\n" + "fmla z21.h, z24.h, z0.h[5]\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - 
"fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z10.h, z25.h, z3.h[5]\n" + "fmla z14.h, z25.h, z2.h[5]\n" + "fmla z18.h, z25.h, z1.h[5]\n" + "fmla z22.h, z25.h, z0.h[5]\n" + "ld1h { z25.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[5]\n" + "fmla z15.h, z24.h, z2.h[5]\n" + "fmla z19.h, z24.h, z1.h[5]\n" + "fmla z23.h, z24.h, z0.h[5]\n" + "ld1h { z24.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[6]\n" + "fmla z12.h, z25.h, z2.h[6]\n" + "fmla z16.h, z25.h, z1.h[6]\n" + "fmla z20.h, z25.h, z0.h[6]\n" + "ld1h { z25.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[6]\n" + "fmla z13.h, z24.h, z2.h[6]\n" + "fmla z17.h, z24.h, z1.h[6]\n" + "fmla z21.h, z24.h, z0.h[6]\n" + "ld1h { z24.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[6]\n" + "fmla z14.h, z25.h, z2.h[6]\n" + "fmla z18.h, z25.h, z1.h[6]\n" + "fmla z22.h, z25.h, z0.h[6]\n" + "ld1h { z25.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z24.h, z3.h[6]\n" + "fmla z15.h, z24.h, z2.h[6]\n" + "fmla z19.h, z24.h, z1.h[6]\n" + "fmla z23.h, z24.h, z0.h[6]\n" + "ld1h { z24.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z25.h, z3.h[7]\n" + "fmla z12.h, z25.h, z2.h[7]\n" + "fmla z16.h, z25.h, z1.h[7]\n" + "fmla z20.h, z25.h, z0.h[7]\n" + "ld1h { z25.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z24.h, z3.h[7]\n" + "fmla z13.h, z24.h, z2.h[7]\n" + "fmla z17.h, z24.h, z1.h[7]\n" + "fmla z21.h, z24.h, z0.h[7]\n" + "ld1h { z24.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z25.h, z3.h[7]\n" + "fmla z14.h, z25.h, z2.h[7]\n" + "fmla z18.h, z25.h, z1.h[7]\n" + "fmla z22.h, z25.h, z0.h[7]\n" + "fmla z11.h, z24.h, z3.h[7]\n" + "fmla z15.h, z24.h, z2.h[7]\n" + "fmla z19.h, z24.h, z1.h[7]\n" + "fmla z23.h, z24.h, z0.h[7]\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1405,187 +1405,187 @@ void sve_hybrid_fp16_mla_6x4VL ( "subs x27, x27, #0x1\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[0]\n" + "fmla z12.h, z25.h, z1.h[0]\n" + "fmla z16.h, z25.h, z2.h[0]\n" + "fmla z20.h, z25.h, z3.h[0]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[0]\n" + "fmla z13.h, z24.h, z1.h[0]\n" + "fmla z17.h, z24.h, z2.h[0]\n" + "fmla z21.h, z24.h, z3.h[0]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z10.h, z25.h, z0.h[0]\n" + "fmla z14.h, z25.h, 
z1.h[0]\n" + "fmla z18.h, z25.h, z2.h[0]\n" + "fmla z22.h, z25.h, z3.h[0]\n" + "fmla z11.h, z24.h, z0.h[0]\n" + "fmla z15.h, z24.h, z1.h[0]\n" + "fmla z19.h, z24.h, z2.h[0]\n" + "fmla z23.h, z24.h, z3.h[0]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[1]\n" + "fmla z12.h, z25.h, z1.h[1]\n" + "fmla z16.h, z25.h, z2.h[1]\n" + "fmla z20.h, z25.h, z3.h[1]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[1]\n" + "fmla z13.h, z24.h, z1.h[1]\n" + "fmla z17.h, z24.h, z2.h[1]\n" + "fmla z21.h, z24.h, z3.h[1]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z10.h, z25.h, z0.h[1]\n" + "fmla z14.h, z25.h, z1.h[1]\n" + "fmla z18.h, z25.h, z2.h[1]\n" + "fmla z22.h, z25.h, z3.h[1]\n" + "fmla z11.h, z24.h, z0.h[1]\n" + "fmla z15.h, z24.h, z1.h[1]\n" + "fmla z19.h, z24.h, z2.h[1]\n" + "fmla z23.h, z24.h, z3.h[1]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[2]\n" + "fmla z12.h, z25.h, z1.h[2]\n" + "fmla z16.h, z25.h, z2.h[2]\n" + "fmla z20.h, z25.h, z3.h[2]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[2]\n" + "fmla z13.h, z24.h, z1.h[2]\n" + "fmla z17.h, z24.h, z2.h[2]\n" + "fmla z21.h, z24.h, z3.h[2]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z10.h, z25.h, z0.h[2]\n" + "fmla z14.h, z25.h, z1.h[2]\n" + "fmla z18.h, z25.h, z2.h[2]\n" + "fmla z22.h, z25.h, z3.h[2]\n" + "fmla z11.h, z24.h, z0.h[2]\n" + "fmla z15.h, z24.h, z1.h[2]\n" + "fmla z19.h, z24.h, z2.h[2]\n" + "fmla z23.h, z24.h, z3.h[2]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[3]\n" + "fmla z12.h, z25.h, z1.h[3]\n" + "fmla z16.h, z25.h, z2.h[3]\n" + "fmla z20.h, z25.h, z3.h[3]\n" + 
"ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[3]\n" + "fmla z13.h, z24.h, z1.h[3]\n" + "fmla z17.h, z24.h, z2.h[3]\n" + "fmla z21.h, z24.h, z3.h[3]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z10.h, z25.h, z0.h[3]\n" + "fmla z14.h, z25.h, z1.h[3]\n" + "fmla z18.h, z25.h, z2.h[3]\n" + "fmla z22.h, z25.h, z3.h[3]\n" + "fmla z11.h, z24.h, z0.h[3]\n" + "fmla z15.h, z24.h, z1.h[3]\n" + "fmla z19.h, z24.h, z2.h[3]\n" + "fmla z23.h, z24.h, z3.h[3]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[4]\n" + "fmla z12.h, z25.h, z1.h[4]\n" + "fmla z16.h, z25.h, z2.h[4]\n" + "fmla z20.h, z25.h, z3.h[4]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[4]\n" + "fmla z13.h, z24.h, z1.h[4]\n" + "fmla z17.h, z24.h, z2.h[4]\n" + "fmla z21.h, z24.h, z3.h[4]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z10.h, z25.h, z0.h[4]\n" + "fmla z14.h, z25.h, z1.h[4]\n" + "fmla z18.h, z25.h, z2.h[4]\n" + "fmla z22.h, z25.h, z3.h[4]\n" + "fmla z11.h, z24.h, z0.h[4]\n" + "fmla z15.h, z24.h, z1.h[4]\n" + "fmla z19.h, z24.h, z2.h[4]\n" + "fmla z23.h, z24.h, z3.h[4]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[5]\n" + "fmla z12.h, z25.h, z1.h[5]\n" + "fmla z16.h, z25.h, z2.h[5]\n" + "fmla z20.h, z25.h, z3.h[5]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[5]\n" + "fmla z13.h, z24.h, z1.h[5]\n" + "fmla z17.h, z24.h, z2.h[5]\n" + "fmla z21.h, z24.h, z3.h[5]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, 
z3.h[5]\n" + "fmla z10.h, z25.h, z0.h[5]\n" + "fmla z14.h, z25.h, z1.h[5]\n" + "fmla z18.h, z25.h, z2.h[5]\n" + "fmla z22.h, z25.h, z3.h[5]\n" + "fmla z11.h, z24.h, z0.h[5]\n" + "fmla z15.h, z24.h, z1.h[5]\n" + "fmla z19.h, z24.h, z2.h[5]\n" + "fmla z23.h, z24.h, z3.h[5]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[6]\n" + "fmla z12.h, z25.h, z1.h[6]\n" + "fmla z16.h, z25.h, z2.h[6]\n" + "fmla z20.h, z25.h, z3.h[6]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[6]\n" + "fmla z13.h, z24.h, z1.h[6]\n" + "fmla z17.h, z24.h, z2.h[6]\n" + "fmla z21.h, z24.h, z3.h[6]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z10.h, z25.h, z0.h[6]\n" + "fmla z14.h, z25.h, z1.h[6]\n" + "fmla z18.h, z25.h, z2.h[6]\n" + "fmla z22.h, z25.h, z3.h[6]\n" + "fmla z11.h, z24.h, z0.h[6]\n" + "fmla z15.h, z24.h, z1.h[6]\n" + "fmla z19.h, z24.h, z2.h[6]\n" + "fmla z23.h, z24.h, z3.h[6]\n" "ble 50f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z25.h, z0.h[7]\n" + "fmla z12.h, z25.h, z1.h[7]\n" + "fmla z16.h, z25.h, z2.h[7]\n" + "fmla z20.h, z25.h, z3.h[7]\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z24.h, z0.h[7]\n" + "fmla z13.h, z24.h, z1.h[7]\n" + "fmla z17.h, z24.h, z2.h[7]\n" + "fmla z21.h, z24.h, z3.h[7]\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z10.h, z25.h, z0.h[7]\n" + "fmla z14.h, z25.h, z1.h[7]\n" + "fmla z18.h, z25.h, z2.h[7]\n" + "fmla z22.h, z25.h, z3.h[7]\n" + "fmla z11.h, z24.h, z0.h[7]\n" + "fmla z15.h, z24.h, z1.h[7]\n" + "fmla z19.h, z24.h, z2.h[7]\n" + "fmla z23.h, z24.h, z3.h[7]\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1597,41 +1597,41 @@ void sve_hybrid_fp16_mla_6x4VL ( "add x23, x24, x20, LSL #1\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z25.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin 
z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmin z20.h, p5/M, z20.h, z1.h\n" - "fmin z21.h, p5/M, z21.h, z1.h\n" - "fmin z22.h, p5/M, z22.h, z1.h\n" - "fmin z23.h, p5/M, z23.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" - "fmax z20.h, p5/M, z20.h, z0.h\n" - "fmax z21.h, p5/M, z21.h, z0.h\n" - "fmax z22.h, p5/M, z22.h, z0.h\n" - "fmax z23.h, p5/M, z23.h, z0.h\n" + "ld1rh { z24.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z25.h\n" + "fmin z9.h, p5/M, z9.h, z25.h\n" + "fmin z10.h, p5/M, z10.h, z25.h\n" + "fmin z11.h, p5/M, z11.h, z25.h\n" + "fmin z12.h, p5/M, z12.h, z25.h\n" + "fmin z13.h, p5/M, z13.h, z25.h\n" + "fmin z14.h, p5/M, z14.h, z25.h\n" + "fmin z15.h, p5/M, z15.h, z25.h\n" + "fmin z16.h, p5/M, z16.h, z25.h\n" + "fmin z17.h, p5/M, z17.h, z25.h\n" + "fmin z18.h, p5/M, z18.h, z25.h\n" + "fmin z19.h, p5/M, z19.h, z25.h\n" + "fmin z20.h, p5/M, z20.h, z25.h\n" + "fmin z21.h, p5/M, z21.h, z25.h\n" + "fmin z22.h, p5/M, z22.h, z25.h\n" + "fmin z23.h, p5/M, z23.h, z25.h\n" + "fmax z8.h, p5/M, z8.h, z24.h\n" + "fmax z9.h, p5/M, z9.h, z24.h\n" + "fmax z10.h, p5/M, z10.h, z24.h\n" + "fmax z11.h, p5/M, z11.h, z24.h\n" + "fmax z12.h, p5/M, z12.h, z24.h\n" + "fmax z13.h, p5/M, z13.h, z24.h\n" + "fmax z14.h, p5/M, z14.h, z24.h\n" + "fmax z15.h, p5/M, z15.h, z24.h\n" + "fmax z16.h, p5/M, z16.h, z24.h\n" + "fmax z17.h, p5/M, z17.h, z24.h\n" + "fmax z18.h, p5/M, z18.h, z24.h\n" + "fmax z19.h, p5/M, z19.h, z24.h\n" + "fmax z20.h, p5/M, z20.h, z24.h\n" + "fmax z21.h, p5/M, z21.h, z24.h\n" + "fmax z22.h, p5/M, z22.h, z24.h\n" + "fmax z23.h, p5/M, z23.h, z24.h\n" "51:" // Height 4: No activation "st1h { z8.h }, p4, [x9]\n" "st1h { z9.h }, p3, [x9, #1, MUL VL]\n" @@ -1695,30 +1695,30 @@ void sve_hybrid_fp16_mla_6x4VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #1\n" + "add x23, x9, x20, LSL #1\n" "add x22, x23, x20, LSL #1\n" + "ld1h { z8.h }, p4/Z, [x9]\n" + "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n" "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" - "ld1h { 
z24.h }, p4/Z, [x22]\n" - "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x23]\n" + "ld1h { z13.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x22]\n" + "ld1h { z17.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x21]\n" + "ld1h { z21.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x20]\n" + "ld1h { z25.h }, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x20, #3, MUL VL]\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1746,15 +1746,15 @@ void sve_hybrid_fp16_mla_6x4VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -1765,221 +1765,221 @@ void sve_hybrid_fp16_mla_6x4VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" "60:" // Height 5: input setup done "cmp x27, #0x8\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z4.h }, p0/Z, [x26]\n" + "ld1rqh { z3.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z1.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1rqh { z0.h }, p0/Z, [x22]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "fmla z8.h, z29.h, z4.h[0]\n" + "fmla z12.h, z29.h, z3.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.h, z29.h, z2.h[0]\n" + "fmla z20.h, z29.h, z1.h[0]\n" "add x25, x25, #0x10\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z24.h, z29.h, z0.h[0]\n" + "fmla z9.h, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z13.h, z28.h, z3.h[0]\n" + "fmla z17.h, z28.h, z2.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z21.h, 
z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z21.h, z28.h, z1.h[0]\n" + "fmla z25.h, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[0]\n" + "fmla z14.h, z29.h, z3.h[0]\n" + "fmla z18.h, z29.h, z2.h[0]\n" + "fmla z22.h, z29.h, z1.h[0]\n" + "fmla z26.h, z29.h, z0.h[0]\n" + "fmla z11.h, z28.h, z4.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[0]\n" + "fmla z19.h, z28.h, z2.h[0]\n" + "fmla z23.h, z28.h, z1.h[0]\n" + "fmla z27.h, z28.h, z0.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[1]\n" + "fmla z12.h, z29.h, z3.h[1]\n" + "fmla z16.h, z29.h, z2.h[1]\n" + "fmla z20.h, z29.h, z1.h[1]\n" + "fmla z24.h, z29.h, z0.h[1]\n" + "fmla z9.h, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[1]\n" + "fmla z17.h, z28.h, z2.h[1]\n" + "fmla z21.h, z28.h, z1.h[1]\n" + "fmla z25.h, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, 
z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[1]\n" + "fmla z14.h, z29.h, z3.h[1]\n" + "fmla z18.h, z29.h, z2.h[1]\n" + "fmla z22.h, z29.h, z1.h[1]\n" + "fmla z26.h, z29.h, z0.h[1]\n" + "fmla z11.h, z28.h, z4.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[1]\n" + "fmla z19.h, z28.h, z2.h[1]\n" + "fmla z23.h, z28.h, z1.h[1]\n" + "fmla z27.h, z28.h, z0.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[2]\n" + "fmla z12.h, z29.h, z3.h[2]\n" + "fmla z16.h, z29.h, z2.h[2]\n" + "fmla z20.h, z29.h, z1.h[2]\n" + "fmla z24.h, z29.h, z0.h[2]\n" + "fmla z9.h, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[2]\n" + "fmla z17.h, z28.h, z2.h[2]\n" + "fmla z21.h, z28.h, z1.h[2]\n" + "fmla z25.h, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[2]\n" + "fmla z14.h, z29.h, z3.h[2]\n" + "fmla z18.h, z29.h, z2.h[2]\n" + "fmla z22.h, z29.h, z1.h[2]\n" + "fmla z26.h, z29.h, z0.h[2]\n" + "fmla z11.h, z28.h, z4.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[2]\n" + "fmla z19.h, z28.h, z2.h[2]\n" + "fmla z23.h, z28.h, z1.h[2]\n" + "fmla z27.h, z28.h, z0.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[3]\n" + "fmla z12.h, z29.h, z3.h[3]\n" + "fmla z16.h, z29.h, z2.h[3]\n" + "fmla z20.h, z29.h, z1.h[3]\n" + "fmla z24.h, z29.h, z0.h[3]\n" + "fmla z9.h, z28.h, z4.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[3]\n" + "fmla z17.h, z28.h, z2.h[3]\n" + "fmla z21.h, z28.h, z1.h[3]\n" + "fmla z25.h, z28.h, z0.h[3]\n" + "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[3]\n" + "fmla z14.h, z29.h, z3.h[3]\n" + "fmla z18.h, z29.h, z2.h[3]\n" + "fmla z22.h, z29.h, z1.h[3]\n" + "fmla z26.h, z29.h, z0.h[3]\n" + "fmla z11.h, z28.h, z4.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "fmla z15.h, z28.h, 
z3.h[3]\n" + "fmla z19.h, z28.h, z2.h[3]\n" + "fmla z23.h, z28.h, z1.h[3]\n" + "fmla z27.h, z28.h, z0.h[3]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[4]\n" + "fmla z12.h, z29.h, z3.h[4]\n" + "fmla z16.h, z29.h, z2.h[4]\n" + "fmla z20.h, z29.h, z1.h[4]\n" + "fmla z24.h, z29.h, z0.h[4]\n" + "fmla z9.h, z28.h, z4.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[4]\n" + "fmla z17.h, z28.h, z2.h[4]\n" + "fmla z21.h, z28.h, z1.h[4]\n" + "fmla z25.h, z28.h, z0.h[4]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[4]\n" + "fmla z14.h, z29.h, z3.h[4]\n" + "fmla z18.h, z29.h, z2.h[4]\n" + "fmla z22.h, z29.h, z1.h[4]\n" + "fmla z26.h, z29.h, z0.h[4]\n" + "fmla z11.h, z28.h, z4.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[4]\n" + "fmla z19.h, z28.h, z2.h[4]\n" + "fmla z23.h, z28.h, z1.h[4]\n" + "fmla z27.h, z28.h, z0.h[4]\n" + "ld1h { z28.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[5]\n" + "fmla z12.h, z29.h, z3.h[5]\n" + "fmla z16.h, z29.h, z2.h[5]\n" + "fmla z20.h, z29.h, z1.h[5]\n" + "fmla z24.h, z29.h, z0.h[5]\n" + "fmla z9.h, z28.h, z4.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[5]\n" + "fmla z17.h, z28.h, z2.h[5]\n" + "fmla z21.h, z28.h, z1.h[5]\n" + "fmla z25.h, z28.h, z0.h[5]\n" + "ld1h { z28.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z10.h, z29.h, z4.h[5]\n" + "fmla z14.h, z29.h, z3.h[5]\n" + "fmla z18.h, z29.h, z2.h[5]\n" + "fmla z22.h, z29.h, z1.h[5]\n" + "fmla z26.h, 
z29.h, z0.h[5]\n" + "fmla z11.h, z28.h, z4.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[5]\n" + "fmla z19.h, z28.h, z2.h[5]\n" + "fmla z23.h, z28.h, z1.h[5]\n" + "fmla z27.h, z28.h, z0.h[5]\n" + "ld1h { z28.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[6]\n" + "fmla z12.h, z29.h, z3.h[6]\n" + "fmla z16.h, z29.h, z2.h[6]\n" + "fmla z20.h, z29.h, z1.h[6]\n" + "fmla z24.h, z29.h, z0.h[6]\n" + "fmla z9.h, z28.h, z4.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[6]\n" + "fmla z17.h, z28.h, z2.h[6]\n" + "fmla z21.h, z28.h, z1.h[6]\n" + "fmla z25.h, z28.h, z0.h[6]\n" + "ld1h { z28.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[6]\n" + "fmla z14.h, z29.h, z3.h[6]\n" + "fmla z18.h, z29.h, z2.h[6]\n" + "fmla z22.h, z29.h, z1.h[6]\n" + "fmla z26.h, z29.h, z0.h[6]\n" + "fmla z11.h, z28.h, z4.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.h, z28.h, z3.h[6]\n" + "fmla z19.h, z28.h, z2.h[6]\n" + "fmla z23.h, z28.h, z1.h[6]\n" + "fmla z27.h, z28.h, z0.h[6]\n" + "ld1h { z28.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z29.h, z4.h[7]\n" + "fmla z12.h, z29.h, z3.h[7]\n" + "fmla z16.h, z29.h, z2.h[7]\n" + "fmla z20.h, z29.h, z1.h[7]\n" + "fmla z24.h, z29.h, z0.h[7]\n" + "fmla z9.h, z28.h, z4.h[7]\n" + "ld1h { z29.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.h, z28.h, z3.h[7]\n" + "fmla z17.h, z28.h, z2.h[7]\n" + "fmla z21.h, z28.h, z1.h[7]\n" + "fmla z25.h, z28.h, z0.h[7]\n" + "ld1h { z28.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z29.h, z4.h[7]\n" + "fmla z14.h, z29.h, z3.h[7]\n" + "fmla z18.h, z29.h, z2.h[7]\n" + "fmla z22.h, z29.h, z1.h[7]\n" + "fmla z26.h, z29.h, z0.h[7]\n" + "fmla z11.h, z28.h, z4.h[7]\n" + "fmla z15.h, z28.h, z3.h[7]\n" + "fmla z19.h, z28.h, z2.h[7]\n" + "fmla z23.h, z28.h, z1.h[7]\n" + "fmla z27.h, z28.h, z0.h[7]\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -1989,219 +1989,219 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z2.h }, p0/Z, [x24]\n" "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "fmla z8.h, z29.h, z0.h[0]\n" + "fmla z12.h, z29.h, z1.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.h, z29.h, z2.h[0]\n" + "fmla z20.h, z29.h, z3.h[0]\n" + "fmla z24.h, z29.h, z4.h[0]\n" + "fmla z9.h, z28.h, z0.h[0]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[0]\n" + "fmla z17.h, z28.h, z2.h[0]\n" + "fmla z21.h, z28.h, z3.h[0]\n" + "fmla z25.h, z28.h, z4.h[0]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z10.h, z29.h, z0.h[0]\n" + "fmla z14.h, z29.h, z1.h[0]\n" + "fmla z18.h, z29.h, 
z2.h[0]\n" + "fmla z22.h, z29.h, z3.h[0]\n" + "fmla z26.h, z29.h, z4.h[0]\n" + "fmla z11.h, z28.h, z0.h[0]\n" + "fmla z15.h, z28.h, z1.h[0]\n" + "fmla z19.h, z28.h, z2.h[0]\n" + "fmla z23.h, z28.h, z3.h[0]\n" + "fmla z27.h, z28.h, z4.h[0]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[1]\n" + "fmla z12.h, z29.h, z1.h[1]\n" + "fmla z16.h, z29.h, z2.h[1]\n" + "fmla z20.h, z29.h, z3.h[1]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[1]\n" + "fmla z9.h, z28.h, z0.h[1]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[1]\n" + "fmla z17.h, z28.h, z2.h[1]\n" + "fmla z21.h, z28.h, z3.h[1]\n" + "fmla z25.h, z28.h, z4.h[1]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z10.h, z29.h, z0.h[1]\n" + "fmla z14.h, z29.h, z1.h[1]\n" + "fmla z18.h, z29.h, z2.h[1]\n" + "fmla z22.h, z29.h, z3.h[1]\n" + "fmla z26.h, z29.h, z4.h[1]\n" + "fmla z11.h, z28.h, z0.h[1]\n" + "fmla z15.h, z28.h, z1.h[1]\n" + "fmla z19.h, z28.h, z2.h[1]\n" + "fmla z23.h, z28.h, z3.h[1]\n" + "fmla z27.h, z28.h, z4.h[1]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[2]\n" + "fmla z12.h, z29.h, z1.h[2]\n" + "fmla z16.h, z29.h, z2.h[2]\n" + "fmla z20.h, z29.h, z3.h[2]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[2]\n" + "fmla z9.h, z28.h, z0.h[2]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[2]\n" + "fmla z17.h, z28.h, z2.h[2]\n" + "fmla z21.h, z28.h, z3.h[2]\n" + "fmla z25.h, z28.h, z4.h[2]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" + "fmla z10.h, z29.h, z0.h[2]\n" + "fmla z14.h, z29.h, z1.h[2]\n" + "fmla z18.h, z29.h, z2.h[2]\n" + "fmla z22.h, z29.h, z3.h[2]\n" + "fmla z26.h, z29.h, z4.h[2]\n" + "fmla z11.h, z28.h, z0.h[2]\n" + "fmla z15.h, z28.h, z1.h[2]\n" + "fmla z19.h, z28.h, 
z2.h[2]\n" + "fmla z23.h, z28.h, z3.h[2]\n" + "fmla z27.h, z28.h, z4.h[2]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[3]\n" + "fmla z12.h, z29.h, z1.h[3]\n" + "fmla z16.h, z29.h, z2.h[3]\n" + "fmla z20.h, z29.h, z3.h[3]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[3]\n" + "fmla z9.h, z28.h, z0.h[3]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[3]\n" + "fmla z17.h, z28.h, z2.h[3]\n" + "fmla z21.h, z28.h, z3.h[3]\n" + "fmla z25.h, z28.h, z4.h[3]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z10.h, z29.h, z0.h[3]\n" + "fmla z14.h, z29.h, z1.h[3]\n" + "fmla z18.h, z29.h, z2.h[3]\n" + "fmla z22.h, z29.h, z3.h[3]\n" + "fmla z26.h, z29.h, z4.h[3]\n" + "fmla z11.h, z28.h, z0.h[3]\n" + "fmla z15.h, z28.h, z1.h[3]\n" + "fmla z19.h, z28.h, z2.h[3]\n" + "fmla z23.h, z28.h, z3.h[3]\n" + "fmla z27.h, z28.h, z4.h[3]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[4]\n" + "fmla z12.h, z29.h, z1.h[4]\n" + "fmla z16.h, z29.h, z2.h[4]\n" + "fmla z20.h, z29.h, z3.h[4]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[4]\n" + "fmla z9.h, z28.h, z0.h[4]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[4]\n" + "fmla z17.h, z28.h, z2.h[4]\n" + "fmla z21.h, z28.h, z3.h[4]\n" + "fmla z25.h, z28.h, z4.h[4]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z10.h, z29.h, z0.h[4]\n" + "fmla z14.h, z29.h, z1.h[4]\n" + "fmla z18.h, z29.h, z2.h[4]\n" + "fmla z22.h, z29.h, z3.h[4]\n" + "fmla z26.h, z29.h, z4.h[4]\n" + "fmla z11.h, z28.h, z0.h[4]\n" + "fmla z15.h, z28.h, z1.h[4]\n" + "fmla z19.h, z28.h, z2.h[4]\n" + "fmla z23.h, z28.h, z3.h[4]\n" + "fmla z27.h, z28.h, z4.h[4]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL 
VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[5]\n" + "fmla z12.h, z29.h, z1.h[5]\n" + "fmla z16.h, z29.h, z2.h[5]\n" + "fmla z20.h, z29.h, z3.h[5]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[5]\n" + "fmla z9.h, z28.h, z0.h[5]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[5]\n" + "fmla z17.h, z28.h, z2.h[5]\n" + "fmla z21.h, z28.h, z3.h[5]\n" + "fmla z25.h, z28.h, z4.h[5]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z10.h, z29.h, z0.h[5]\n" + "fmla z14.h, z29.h, z1.h[5]\n" + "fmla z18.h, z29.h, z2.h[5]\n" + "fmla z22.h, z29.h, z3.h[5]\n" + "fmla z26.h, z29.h, z4.h[5]\n" + "fmla z11.h, z28.h, z0.h[5]\n" + "fmla z15.h, z28.h, z1.h[5]\n" + "fmla z19.h, z28.h, z2.h[5]\n" + "fmla z23.h, z28.h, z3.h[5]\n" + "fmla z27.h, z28.h, z4.h[5]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[6]\n" + "fmla z12.h, z29.h, z1.h[6]\n" + "fmla z16.h, z29.h, z2.h[6]\n" + "fmla z20.h, z29.h, z3.h[6]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z29.h, z4.h[6]\n" + "fmla z9.h, z28.h, z0.h[6]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[6]\n" + "fmla z17.h, z28.h, z2.h[6]\n" + "fmla z21.h, z28.h, z3.h[6]\n" + "fmla z25.h, z28.h, z4.h[6]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z10.h, z29.h, z0.h[6]\n" + "fmla z14.h, z29.h, z1.h[6]\n" + "fmla z18.h, z29.h, z2.h[6]\n" + "fmla z22.h, z29.h, z3.h[6]\n" + "fmla z26.h, z29.h, z4.h[6]\n" + "fmla z11.h, z28.h, z0.h[6]\n" + "fmla z15.h, z28.h, z1.h[6]\n" + "fmla z19.h, z28.h, z2.h[6]\n" + "fmla z23.h, z28.h, z3.h[6]\n" + "fmla z27.h, z28.h, z4.h[6]\n" "ble 63f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, 
z4.h[7]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z29.h }, p5/Z, [x10]\n" + "ld1h { z28.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z29.h, z0.h[7]\n" + "fmla z12.h, z29.h, z1.h[7]\n" + "fmla z16.h, z29.h, z2.h[7]\n" + "fmla z20.h, z29.h, z3.h[7]\n" + "fmla z24.h, z29.h, z4.h[7]\n" + "fmla z9.h, z28.h, z0.h[7]\n" + "ld1h { z29.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.h, z28.h, z1.h[7]\n" + "fmla z17.h, z28.h, z2.h[7]\n" + "fmla z21.h, z28.h, z3.h[7]\n" + "fmla z25.h, z28.h, z4.h[7]\n" + "ld1h { z28.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z10.h, z29.h, z0.h[7]\n" + "fmla z14.h, z29.h, z1.h[7]\n" + "fmla z18.h, z29.h, z2.h[7]\n" + "fmla z22.h, z29.h, z3.h[7]\n" + "fmla z26.h, z29.h, z4.h[7]\n" + "fmla z11.h, z28.h, z0.h[7]\n" + "fmla z15.h, z28.h, z1.h[7]\n" + "fmla z19.h, z28.h, z2.h[7]\n" + "fmla z23.h, z28.h, z3.h[7]\n" + "fmla z27.h, z28.h, z4.h[7]\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2214,49 +2214,49 @@ void sve_hybrid_fp16_mla_6x4VL ( "add x22, x23, x20, LSL #1\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rh { z1.h }, p5/Z, [x20]\n" + "ld1rh { z29.h }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rh { z0.h }, p5/Z, [x20]\n" - "fmin z8.h, p5/M, z8.h, z1.h\n" - "fmin z9.h, p5/M, z9.h, z1.h\n" - "fmin z10.h, p5/M, z10.h, z1.h\n" - "fmin z11.h, p5/M, z11.h, z1.h\n" - "fmin z12.h, p5/M, z12.h, z1.h\n" - "fmin z13.h, p5/M, z13.h, z1.h\n" - "fmin z14.h, p5/M, z14.h, z1.h\n" - "fmin z15.h, p5/M, z15.h, z1.h\n" - "fmin z16.h, p5/M, z16.h, z1.h\n" - "fmin z17.h, p5/M, z17.h, z1.h\n" - "fmin z18.h, p5/M, z18.h, z1.h\n" - "fmin z19.h, p5/M, z19.h, z1.h\n" - "fmin z20.h, p5/M, z20.h, z1.h\n" - "fmin z21.h, p5/M, z21.h, z1.h\n" - "fmin z22.h, p5/M, z22.h, z1.h\n" - "fmin z23.h, p5/M, z23.h, z1.h\n" - "fmin z24.h, p5/M, z24.h, z1.h\n" - "fmin z25.h, p5/M, z25.h, z1.h\n" - "fmin z26.h, p5/M, z26.h, z1.h\n" - "fmin z27.h, p5/M, z27.h, z1.h\n" - "fmax z8.h, p5/M, z8.h, z0.h\n" - "fmax z9.h, p5/M, z9.h, z0.h\n" - "fmax z10.h, p5/M, z10.h, z0.h\n" - "fmax z11.h, p5/M, z11.h, z0.h\n" - "fmax z12.h, p5/M, z12.h, z0.h\n" - "fmax z13.h, p5/M, z13.h, z0.h\n" - "fmax z14.h, p5/M, z14.h, z0.h\n" - "fmax z15.h, p5/M, z15.h, z0.h\n" - "fmax z16.h, p5/M, z16.h, z0.h\n" - "fmax z17.h, p5/M, z17.h, z0.h\n" - "fmax z18.h, p5/M, z18.h, z0.h\n" - "fmax z19.h, p5/M, z19.h, z0.h\n" - "fmax z20.h, p5/M, z20.h, z0.h\n" - "fmax z21.h, p5/M, z21.h, z0.h\n" - "fmax z22.h, p5/M, z22.h, z0.h\n" - "fmax z23.h, p5/M, z23.h, z0.h\n" - "fmax z24.h, p5/M, z24.h, z0.h\n" - "fmax z25.h, p5/M, z25.h, z0.h\n" - "fmax z26.h, p5/M, z26.h, z0.h\n" - "fmax z27.h, p5/M, z27.h, z0.h\n" + "ld1rh { z28.h }, p5/Z, [x20]\n" + "fmin z8.h, p5/M, z8.h, z29.h\n" + "fmin z9.h, p5/M, z9.h, z29.h\n" + "fmin z10.h, p5/M, z10.h, z29.h\n" + "fmin z11.h, p5/M, z11.h, z29.h\n" + "fmin z12.h, p5/M, z12.h, z29.h\n" + "fmin z13.h, p5/M, z13.h, z29.h\n" + "fmin 
z14.h, p5/M, z14.h, z29.h\n" + "fmin z15.h, p5/M, z15.h, z29.h\n" + "fmin z16.h, p5/M, z16.h, z29.h\n" + "fmin z17.h, p5/M, z17.h, z29.h\n" + "fmin z18.h, p5/M, z18.h, z29.h\n" + "fmin z19.h, p5/M, z19.h, z29.h\n" + "fmin z20.h, p5/M, z20.h, z29.h\n" + "fmin z21.h, p5/M, z21.h, z29.h\n" + "fmin z22.h, p5/M, z22.h, z29.h\n" + "fmin z23.h, p5/M, z23.h, z29.h\n" + "fmin z24.h, p5/M, z24.h, z29.h\n" + "fmin z25.h, p5/M, z25.h, z29.h\n" + "fmin z26.h, p5/M, z26.h, z29.h\n" + "fmin z27.h, p5/M, z27.h, z29.h\n" + "fmax z8.h, p5/M, z8.h, z28.h\n" + "fmax z9.h, p5/M, z9.h, z28.h\n" + "fmax z10.h, p5/M, z10.h, z28.h\n" + "fmax z11.h, p5/M, z11.h, z28.h\n" + "fmax z12.h, p5/M, z12.h, z28.h\n" + "fmax z13.h, p5/M, z13.h, z28.h\n" + "fmax z14.h, p5/M, z14.h, z28.h\n" + "fmax z15.h, p5/M, z15.h, z28.h\n" + "fmax z16.h, p5/M, z16.h, z28.h\n" + "fmax z17.h, p5/M, z17.h, z28.h\n" + "fmax z18.h, p5/M, z18.h, z28.h\n" + "fmax z19.h, p5/M, z19.h, z28.h\n" + "fmax z20.h, p5/M, z20.h, z28.h\n" + "fmax z21.h, p5/M, z21.h, z28.h\n" + "fmax z22.h, p5/M, z22.h, z28.h\n" + "fmax z23.h, p5/M, z23.h, z28.h\n" + "fmax z24.h, p5/M, z24.h, z28.h\n" + "fmax z25.h, p5/M, z25.h, z28.h\n" + "fmax z26.h, p5/M, z26.h, z28.h\n" + "fmax z27.h, p5/M, z27.h, z28.h\n" "64:" // Height 5: No activation "st1h { z8.h }, p4, [x9]\n" "st1h { z9.h }, p3, [x9, #1, MUL VL]\n" @@ -2331,35 +2331,35 @@ void sve_hybrid_fp16_mla_6x4VL ( "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "ld1h { z8.h }, p4/Z, [x9]\n" + "add x24, x9, x20, LSL #1\n" "add x23, x24, x20, LSL #1\n" + "ld1h { z8.h }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #1\n" + "add x21, x22, x20, LSL #1\n" "ld1h { z9.h }, p3/Z, [x9, #1, MUL VL]\n" "ld1h { z10.h }, p2/Z, [x9, #2, MUL VL]\n" - "add x21, x22, x20, LSL #1\n" + "add x20, x21, x20, LSL #1\n" "ld1h { z11.h }, p1/Z, [x9, #3, MUL VL]\n" - "ld1h { z12.h }, p4/Z, [x25]\n" - "ld1h { z13.h }, p3/Z, [x25, #1, MUL VL]\n" - "ld1h { z14.h }, p2/Z, [x25, #2, MUL VL]\n" - "ld1h { z15.h }, p1/Z, [x25, #3, MUL VL]\n" - "ld1h { z16.h }, p4/Z, [x24]\n" - "ld1h { z17.h }, p3/Z, [x24, #1, MUL VL]\n" - "ld1h { z18.h }, p2/Z, [x24, #2, MUL VL]\n" - "ld1h { z19.h }, p1/Z, [x24, #3, MUL VL]\n" - "ld1h { z20.h }, p4/Z, [x23]\n" - "ld1h { z21.h }, p3/Z, [x23, #1, MUL VL]\n" - "ld1h { z22.h }, p2/Z, [x23, #2, MUL VL]\n" - "ld1h { z23.h }, p1/Z, [x23, #3, MUL VL]\n" - "ld1h { z24.h }, p4/Z, [x22]\n" - "ld1h { z25.h }, p3/Z, [x22, #1, MUL VL]\n" - "ld1h { z26.h }, p2/Z, [x22, #2, MUL VL]\n" - "ld1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n" - "ld1h { z28.h }, p4/Z, [x21]\n" - "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n" - "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n" - "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x24]\n" + "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x23]\n" + "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x22]\n" + "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x21]\n" + "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x20]\n" + "ld1h { z29.h 
}, p3/Z, [x20, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n" + "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -2391,16 +2391,16 @@ void sve_hybrid_fp16_mla_6x4VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #1\n" @@ -2412,256 +2412,256 @@ void sve_hybrid_fp16_mla_6x4VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #1\n" - "add x24, x25, x20, LSL #1\n" - "add x23, x24, x20, LSL #1\n" - "add x22, x23, x20, LSL #1\n" - "add x21, x22, x20, LSL #1\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" "73:" // Height 6: input setup done "cmp x27, #0x8\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.h, XZR, x27\n" - "ld1rqh { z0.h }, p0/Z, [x26]\n" - "ld1rqh { z1.h }, p0/Z, [x25]\n" + "ld1rqh { z7.h }, p0/Z, [x26]\n" + "ld1rqh { z6.h }, p0/Z, [x25]\n" "sub x27, x27, #0x8\n" - "ld1rqh { z2.h }, p0/Z, [x24]\n" - "ld1rqh { z3.h }, p0/Z, [x23]\n" + "ld1rqh { z5.h }, p0/Z, [x24]\n" + "ld1rqh { z4.h }, p0/Z, [x23]\n" "cmp x27, #0x8\n" "add x26, x26, #0x10\n" - "ld1rqh { z4.h }, p0/Z, [x22]\n" - "ld1rqh { z5.h }, p0/Z, [x21]\n" + "ld1rqh { z3.h }, p0/Z, [x22]\n" + "ld1rqh { z2.h }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[0]\n" + "fmla z12.h, z1.h, z6.h[0]\n" + "fmla z16.h, z1.h, z5.h[0]\n" + "fmla z20.h, z1.h, z4.h[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z24.h, z1.h, z3.h[0]\n" + "fmla z28.h, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" "add x21, x21, #0x10\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z30.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, 
z4.h[0]\n" - "fmla z31.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[0]\n" + "fmla z13.h, z0.h, z6.h[0]\n" + "fmla z17.h, z0.h, z5.h[0]\n" + "fmla z21.h, z0.h, z4.h[0]\n" + "fmla z25.h, z0.h, z3.h[0]\n" + "fmla z29.h, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[0]\n" + "fmla z14.h, z1.h, z6.h[0]\n" + "fmla z18.h, z1.h, z5.h[0]\n" + "fmla z22.h, z1.h, z4.h[0]\n" + "fmla z26.h, z1.h, z3.h[0]\n" + "fmla z30.h, z1.h, z2.h[0]\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[0]\n" + "fmla z15.h, z0.h, z6.h[0]\n" + "fmla z19.h, z0.h, z5.h[0]\n" + "fmla z23.h, z0.h, z4.h[0]\n" + "fmla z27.h, z0.h, z3.h[0]\n" + "fmla z31.h, z0.h, z2.h[0]\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[1]\n" + "fmla z12.h, z1.h, z6.h[1]\n" + "fmla z16.h, z1.h, z5.h[1]\n" + "fmla z20.h, z1.h, z4.h[1]\n" + "fmla z24.h, z1.h, z3.h[1]\n" + "fmla z28.h, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[1]\n" + "fmla z13.h, z0.h, z6.h[1]\n" + "fmla z17.h, z0.h, z5.h[1]\n" + "fmla z21.h, z0.h, z4.h[1]\n" + "fmla z25.h, z0.h, z3.h[1]\n" + "fmla z29.h, z0.h, z2.h[1]\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z30.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "fmla z31.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z30.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "fmla z31.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, 
z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z30.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "fmla z31.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z30.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "fmla z31.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[1]\n" + "fmla z14.h, z1.h, z6.h[1]\n" + "fmla z18.h, z1.h, z5.h[1]\n" + "fmla z22.h, z1.h, z4.h[1]\n" + "fmla z26.h, z1.h, z3.h[1]\n" + "fmla z30.h, z1.h, z2.h[1]\n" + "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[1]\n" + "fmla z15.h, z0.h, z6.h[1]\n" + "fmla z19.h, z0.h, z5.h[1]\n" + "fmla z23.h, z0.h, z4.h[1]\n" + "fmla z27.h, z0.h, z3.h[1]\n" + "fmla z31.h, z0.h, z2.h[1]\n" + "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[2]\n" + "fmla z12.h, z1.h, z6.h[2]\n" + "fmla z16.h, z1.h, z5.h[2]\n" + "fmla z20.h, z1.h, z4.h[2]\n" + "fmla z24.h, z1.h, z3.h[2]\n" + "fmla z28.h, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[2]\n" + "fmla z13.h, z0.h, z6.h[2]\n" + "fmla z17.h, z0.h, z5.h[2]\n" + "fmla z21.h, z0.h, z4.h[2]\n" + "fmla z25.h, z0.h, z3.h[2]\n" + "fmla z29.h, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[2]\n" + "fmla z14.h, z1.h, z6.h[2]\n" + "fmla z18.h, z1.h, z5.h[2]\n" + "fmla z22.h, z1.h, z4.h[2]\n" + "fmla z26.h, z1.h, z3.h[2]\n" + "fmla z30.h, z1.h, z2.h[2]\n" + "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[2]\n" + "fmla z15.h, z0.h, z6.h[2]\n" + "fmla z19.h, z0.h, z5.h[2]\n" + "fmla z23.h, z0.h, z4.h[2]\n" + "fmla z27.h, z0.h, z3.h[2]\n" + "fmla z31.h, z0.h, z2.h[2]\n" + "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[3]\n" + "fmla z12.h, z1.h, z6.h[3]\n" + "fmla z16.h, z1.h, z5.h[3]\n" + "fmla z20.h, z1.h, z4.h[3]\n" 
+ "fmla z24.h, z1.h, z3.h[3]\n" + "fmla z28.h, z1.h, z2.h[3]\n" + "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[3]\n" + "fmla z13.h, z0.h, z6.h[3]\n" + "fmla z17.h, z0.h, z5.h[3]\n" + "fmla z21.h, z0.h, z4.h[3]\n" + "fmla z25.h, z0.h, z3.h[3]\n" + "fmla z29.h, z0.h, z2.h[3]\n" + "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[3]\n" + "fmla z14.h, z1.h, z6.h[3]\n" + "fmla z18.h, z1.h, z5.h[3]\n" + "fmla z22.h, z1.h, z4.h[3]\n" + "fmla z26.h, z1.h, z3.h[3]\n" + "fmla z30.h, z1.h, z2.h[3]\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "fmla z11.h, z0.h, z7.h[3]\n" + "fmla z15.h, z0.h, z6.h[3]\n" + "fmla z19.h, z0.h, z5.h[3]\n" + "fmla z23.h, z0.h, z4.h[3]\n" + "fmla z27.h, z0.h, z3.h[3]\n" + "fmla z31.h, z0.h, z2.h[3]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[4]\n" + "fmla z12.h, z1.h, z6.h[4]\n" + "fmla z16.h, z1.h, z5.h[4]\n" + "fmla z20.h, z1.h, z4.h[4]\n" + "fmla z24.h, z1.h, z3.h[4]\n" + "fmla z28.h, z1.h, z2.h[4]\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[4]\n" + "fmla z13.h, z0.h, z6.h[4]\n" + "fmla z17.h, z0.h, z5.h[4]\n" + "fmla z21.h, z0.h, z4.h[4]\n" + "fmla z25.h, z0.h, z3.h[4]\n" + "fmla z29.h, z0.h, z2.h[4]\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[4]\n" + "fmla z14.h, z1.h, z6.h[4]\n" + "fmla z18.h, z1.h, z5.h[4]\n" + "fmla z22.h, z1.h, z4.h[4]\n" + "fmla z26.h, z1.h, z3.h[4]\n" + "fmla z30.h, z1.h, z2.h[4]\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[4]\n" + "fmla z15.h, z0.h, z6.h[4]\n" + "fmla z19.h, z0.h, z5.h[4]\n" + "fmla z23.h, z0.h, z4.h[4]\n" + "fmla z27.h, z0.h, z3.h[4]\n" + "fmla z31.h, z0.h, z2.h[4]\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[5]\n" + "fmla z12.h, z1.h, z6.h[5]\n" + "fmla z16.h, z1.h, z5.h[5]\n" + "fmla z20.h, z1.h, z4.h[5]\n" + "fmla z24.h, z1.h, z3.h[5]\n" + "fmla z28.h, z1.h, z2.h[5]\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[5]\n" + "fmla z13.h, z0.h, z6.h[5]\n" + "fmla z17.h, z0.h, z5.h[5]\n" + "fmla z21.h, z0.h, z4.h[5]\n" + "fmla z25.h, z0.h, z3.h[5]\n" + "fmla z29.h, z0.h, z2.h[5]\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z30.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "fmla z31.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z30.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, 
z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "fmla z31.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z30.h, z6.h, z5.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" - "fmla z31.h, z7.h, z5.h[7]\n" + "fmla z10.h, z1.h, z7.h[5]\n" + "fmla z14.h, z1.h, z6.h[5]\n" + "fmla z18.h, z1.h, z5.h[5]\n" + "fmla z22.h, z1.h, z4.h[5]\n" + "fmla z26.h, z1.h, z3.h[5]\n" + "fmla z30.h, z1.h, z2.h[5]\n" + "ld1h { z1.h }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[5]\n" + "fmla z15.h, z0.h, z6.h[5]\n" + "fmla z19.h, z0.h, z5.h[5]\n" + "fmla z23.h, z0.h, z4.h[5]\n" + "fmla z27.h, z0.h, z3.h[5]\n" + "fmla z31.h, z0.h, z2.h[5]\n" + "ld1h { z0.h }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[6]\n" + "fmla z12.h, z1.h, z6.h[6]\n" + "fmla z16.h, z1.h, z5.h[6]\n" + "fmla z20.h, z1.h, z4.h[6]\n" + "fmla z24.h, z1.h, z3.h[6]\n" + "fmla z28.h, z1.h, z2.h[6]\n" + "ld1h { z1.h }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[6]\n" + "fmla z13.h, z0.h, z6.h[6]\n" + "fmla z17.h, z0.h, z5.h[6]\n" + "fmla z21.h, z0.h, z4.h[6]\n" + "fmla z25.h, z0.h, z3.h[6]\n" + "fmla z29.h, z0.h, z2.h[6]\n" + "ld1h { z0.h }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[6]\n" + "fmla z14.h, z1.h, z6.h[6]\n" + "fmla z18.h, z1.h, z5.h[6]\n" + "fmla z22.h, z1.h, z4.h[6]\n" + "fmla z26.h, z1.h, z3.h[6]\n" + "fmla z30.h, z1.h, z2.h[6]\n" + "ld1h { z1.h }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.h, z0.h, z7.h[6]\n" + "fmla z15.h, z0.h, z6.h[6]\n" + "fmla z19.h, z0.h, z5.h[6]\n" + "fmla z23.h, z0.h, z4.h[6]\n" + "fmla z27.h, z0.h, z3.h[6]\n" + "fmla z31.h, z0.h, z2.h[6]\n" + "ld1h { z0.h }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.h, z1.h, z7.h[7]\n" + "fmla z12.h, z1.h, z6.h[7]\n" + "fmla z16.h, z1.h, z5.h[7]\n" + "fmla z20.h, z1.h, z4.h[7]\n" + "fmla z24.h, z1.h, z3.h[7]\n" + "fmla z28.h, z1.h, z2.h[7]\n" + "ld1h { z1.h }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.h, z0.h, z7.h[7]\n" + "fmla z13.h, z0.h, z6.h[7]\n" + "fmla z17.h, z0.h, z5.h[7]\n" + "fmla z21.h, z0.h, z4.h[7]\n" + "fmla z25.h, z0.h, z3.h[7]\n" + "fmla z29.h, z0.h, z2.h[7]\n" + "ld1h { z0.h }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.h, z1.h, z7.h[7]\n" + "fmla z14.h, z1.h, z6.h[7]\n" + "fmla z18.h, z1.h, z5.h[7]\n" + "fmla z22.h, z1.h, z4.h[7]\n" + "fmla z26.h, z1.h, z3.h[7]\n" + "fmla z30.h, z1.h, z2.h[7]\n" + "fmla z11.h, z0.h, z7.h[7]\n" + "fmla z15.h, z0.h, z6.h[7]\n" + "fmla z19.h, z0.h, z5.h[7]\n" + "fmla z23.h, z0.h, z4.h[7]\n" + "fmla z27.h, z0.h, z3.h[7]\n" + "fmla z31.h, z0.h, z2.h[7]\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.h, XZR, x27\n" @@ -2672,251 +2672,251 @@ void sve_hybrid_fp16_mla_6x4VL ( "ld1rqh { z3.h }, p0/Z, [x23]\n" "ld1rqh { z4.h }, p0/Z, [x22]\n" "ld1rqh { z5.h }, p0/Z, [x21]\n" - "ld1h { z6.h }, p5/Z, [x10]\n" 
- "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z1.h[0]\n" - "fmla z16.h, z6.h, z2.h[0]\n" - "fmla z20.h, z6.h, z3.h[0]\n" - "fmla z24.h, z6.h, z4.h[0]\n" - "fmla z28.h, z6.h, z5.h[0]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z1.h[0]\n" - "fmla z17.h, z7.h, z2.h[0]\n" - "fmla z21.h, z7.h, z3.h[0]\n" - "fmla z25.h, z7.h, z4.h[0]\n" - "fmla z29.h, z7.h, z5.h[0]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[0]\n" + "fmla z12.h, z7.h, z1.h[0]\n" + "fmla z16.h, z7.h, z2.h[0]\n" + "fmla z20.h, z7.h, z3.h[0]\n" + "fmla z24.h, z7.h, z4.h[0]\n" + "fmla z28.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[0]\n" + "fmla z13.h, z6.h, z1.h[0]\n" + "fmla z17.h, z6.h, z2.h[0]\n" + "fmla z21.h, z6.h, z3.h[0]\n" + "fmla z25.h, z6.h, z4.h[0]\n" + "fmla z29.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[0]\n" - "fmla z14.h, z6.h, z1.h[0]\n" - "fmla z18.h, z6.h, z2.h[0]\n" - "fmla z22.h, z6.h, z3.h[0]\n" - "fmla z26.h, z6.h, z4.h[0]\n" - "fmla z30.h, z6.h, z5.h[0]\n" - "fmla z11.h, z7.h, z0.h[0]\n" - "fmla z15.h, z7.h, z1.h[0]\n" - "fmla z19.h, z7.h, z2.h[0]\n" - "fmla z23.h, z7.h, z3.h[0]\n" - "fmla z27.h, z7.h, z4.h[0]\n" - "fmla z31.h, z7.h, z5.h[0]\n" + "fmla z10.h, z7.h, z0.h[0]\n" + "fmla z14.h, z7.h, z1.h[0]\n" + "fmla z18.h, z7.h, z2.h[0]\n" + "fmla z22.h, z7.h, z3.h[0]\n" + "fmla z26.h, z7.h, z4.h[0]\n" + "fmla z30.h, z7.h, z5.h[0]\n" + "fmla z11.h, z6.h, z0.h[0]\n" + "fmla z15.h, z6.h, z1.h[0]\n" + "fmla z19.h, z6.h, z2.h[0]\n" + "fmla z23.h, z6.h, z3.h[0]\n" + "fmla z27.h, z6.h, z4.h[0]\n" + "fmla z31.h, z6.h, z5.h[0]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[1]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z16.h, z6.h, z2.h[1]\n" - "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[1]\n" + "fmla z12.h, z7.h, z1.h[1]\n" + "fmla z16.h, z7.h, z2.h[1]\n" + "fmla z20.h, z7.h, z3.h[1]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[1]\n" - "fmla z28.h, z6.h, z5.h[1]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[1]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z17.h, z7.h, z2.h[1]\n" - "fmla z21.h, z7.h, z3.h[1]\n" - "fmla z25.h, z7.h, z4.h[1]\n" - "fmla z29.h, z7.h, z5.h[1]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[1]\n" + "fmla z28.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[1]\n" + "fmla z13.h, z6.h, z1.h[1]\n" + "fmla z17.h, z6.h, z2.h[1]\n" + "fmla z21.h, z6.h, z3.h[1]\n" + "fmla z25.h, z6.h, z4.h[1]\n" + "fmla z29.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[1]\n" - "fmla z14.h, z6.h, z1.h[1]\n" - "fmla z18.h, z6.h, z2.h[1]\n" - "fmla z22.h, z6.h, z3.h[1]\n" - "fmla z26.h, z6.h, z4.h[1]\n" - "fmla z30.h, z6.h, z5.h[1]\n" - "fmla z11.h, z7.h, z0.h[1]\n" - "fmla z15.h, z7.h, z1.h[1]\n" - "fmla z19.h, z7.h, z2.h[1]\n" - "fmla z23.h, z7.h, z3.h[1]\n" - "fmla z27.h, z7.h, z4.h[1]\n" - "fmla z31.h, z7.h, z5.h[1]\n" + "fmla z10.h, z7.h, z0.h[1]\n" + "fmla z14.h, z7.h, z1.h[1]\n" + "fmla z18.h, z7.h, z2.h[1]\n" + "fmla z22.h, z7.h, z3.h[1]\n" + "fmla z26.h, z7.h, 
z4.h[1]\n" + "fmla z30.h, z7.h, z5.h[1]\n" + "fmla z11.h, z6.h, z0.h[1]\n" + "fmla z15.h, z6.h, z1.h[1]\n" + "fmla z19.h, z6.h, z2.h[1]\n" + "fmla z23.h, z6.h, z3.h[1]\n" + "fmla z27.h, z6.h, z4.h[1]\n" + "fmla z31.h, z6.h, z5.h[1]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[2]\n" - "fmla z12.h, z6.h, z1.h[2]\n" - "fmla z16.h, z6.h, z2.h[2]\n" - "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[2]\n" + "fmla z12.h, z7.h, z1.h[2]\n" + "fmla z16.h, z7.h, z2.h[2]\n" + "fmla z20.h, z7.h, z3.h[2]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[2]\n" - "fmla z28.h, z6.h, z5.h[2]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[2]\n" - "fmla z13.h, z7.h, z1.h[2]\n" - "fmla z17.h, z7.h, z2.h[2]\n" - "fmla z21.h, z7.h, z3.h[2]\n" - "fmla z25.h, z7.h, z4.h[2]\n" - "fmla z29.h, z7.h, z5.h[2]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[2]\n" + "fmla z28.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[2]\n" + "fmla z13.h, z6.h, z1.h[2]\n" + "fmla z17.h, z6.h, z2.h[2]\n" + "fmla z21.h, z6.h, z3.h[2]\n" + "fmla z25.h, z6.h, z4.h[2]\n" + "fmla z29.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[2]\n" - "fmla z14.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z2.h[2]\n" - "fmla z22.h, z6.h, z3.h[2]\n" - "fmla z26.h, z6.h, z4.h[2]\n" - "fmla z30.h, z6.h, z5.h[2]\n" - "fmla z11.h, z7.h, z0.h[2]\n" - "fmla z15.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z2.h[2]\n" - "fmla z23.h, z7.h, z3.h[2]\n" - "fmla z27.h, z7.h, z4.h[2]\n" - "fmla z31.h, z7.h, z5.h[2]\n" + "fmla z10.h, z7.h, z0.h[2]\n" + "fmla z14.h, z7.h, z1.h[2]\n" + "fmla z18.h, z7.h, z2.h[2]\n" + "fmla z22.h, z7.h, z3.h[2]\n" + "fmla z26.h, z7.h, z4.h[2]\n" + "fmla z30.h, z7.h, z5.h[2]\n" + "fmla z11.h, z6.h, z0.h[2]\n" + "fmla z15.h, z6.h, z1.h[2]\n" + "fmla z19.h, z6.h, z2.h[2]\n" + "fmla z23.h, z6.h, z3.h[2]\n" + "fmla z27.h, z6.h, z4.h[2]\n" + "fmla z31.h, z6.h, z5.h[2]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[3]\n" - "fmla z12.h, z6.h, z1.h[3]\n" - "fmla z16.h, z6.h, z2.h[3]\n" - "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[3]\n" + "fmla z12.h, z7.h, z1.h[3]\n" + "fmla z16.h, z7.h, z2.h[3]\n" + "fmla z20.h, z7.h, z3.h[3]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[3]\n" - "fmla z28.h, z6.h, z5.h[3]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[3]\n" - "fmla z13.h, z7.h, z1.h[3]\n" - "fmla z17.h, z7.h, z2.h[3]\n" - "fmla z21.h, z7.h, z3.h[3]\n" - "fmla z25.h, z7.h, z4.h[3]\n" - "fmla z29.h, z7.h, z5.h[3]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[3]\n" + "fmla z28.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[3]\n" + "fmla z13.h, z6.h, z1.h[3]\n" + "fmla z17.h, z6.h, z2.h[3]\n" + "fmla z21.h, z6.h, z3.h[3]\n" + "fmla z25.h, z6.h, z4.h[3]\n" + "fmla z29.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[3]\n" - "fmla z14.h, z6.h, z1.h[3]\n" - "fmla z18.h, z6.h, z2.h[3]\n" - "fmla z22.h, z6.h, z3.h[3]\n" - "fmla z26.h, z6.h, z4.h[3]\n" - "fmla z30.h, z6.h, z5.h[3]\n" - "fmla z11.h, z7.h, z0.h[3]\n" - "fmla 
z15.h, z7.h, z1.h[3]\n" - "fmla z19.h, z7.h, z2.h[3]\n" - "fmla z23.h, z7.h, z3.h[3]\n" - "fmla z27.h, z7.h, z4.h[3]\n" - "fmla z31.h, z7.h, z5.h[3]\n" + "fmla z10.h, z7.h, z0.h[3]\n" + "fmla z14.h, z7.h, z1.h[3]\n" + "fmla z18.h, z7.h, z2.h[3]\n" + "fmla z22.h, z7.h, z3.h[3]\n" + "fmla z26.h, z7.h, z4.h[3]\n" + "fmla z30.h, z7.h, z5.h[3]\n" + "fmla z11.h, z6.h, z0.h[3]\n" + "fmla z15.h, z6.h, z1.h[3]\n" + "fmla z19.h, z6.h, z2.h[3]\n" + "fmla z23.h, z6.h, z3.h[3]\n" + "fmla z27.h, z6.h, z4.h[3]\n" + "fmla z31.h, z6.h, z5.h[3]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[4]\n" - "fmla z12.h, z6.h, z1.h[4]\n" - "fmla z16.h, z6.h, z2.h[4]\n" - "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[4]\n" + "fmla z12.h, z7.h, z1.h[4]\n" + "fmla z16.h, z7.h, z2.h[4]\n" + "fmla z20.h, z7.h, z3.h[4]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[4]\n" - "fmla z28.h, z6.h, z5.h[4]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[4]\n" - "fmla z13.h, z7.h, z1.h[4]\n" - "fmla z17.h, z7.h, z2.h[4]\n" - "fmla z21.h, z7.h, z3.h[4]\n" - "fmla z25.h, z7.h, z4.h[4]\n" - "fmla z29.h, z7.h, z5.h[4]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[4]\n" + "fmla z28.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[4]\n" + "fmla z13.h, z6.h, z1.h[4]\n" + "fmla z17.h, z6.h, z2.h[4]\n" + "fmla z21.h, z6.h, z3.h[4]\n" + "fmla z25.h, z6.h, z4.h[4]\n" + "fmla z29.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[4]\n" - "fmla z14.h, z6.h, z1.h[4]\n" - "fmla z18.h, z6.h, z2.h[4]\n" - "fmla z22.h, z6.h, z3.h[4]\n" - "fmla z26.h, z6.h, z4.h[4]\n" - "fmla z30.h, z6.h, z5.h[4]\n" - "fmla z11.h, z7.h, z0.h[4]\n" - "fmla z15.h, z7.h, z1.h[4]\n" - "fmla z19.h, z7.h, z2.h[4]\n" - "fmla z23.h, z7.h, z3.h[4]\n" - "fmla z27.h, z7.h, z4.h[4]\n" - "fmla z31.h, z7.h, z5.h[4]\n" + "fmla z10.h, z7.h, z0.h[4]\n" + "fmla z14.h, z7.h, z1.h[4]\n" + "fmla z18.h, z7.h, z2.h[4]\n" + "fmla z22.h, z7.h, z3.h[4]\n" + "fmla z26.h, z7.h, z4.h[4]\n" + "fmla z30.h, z7.h, z5.h[4]\n" + "fmla z11.h, z6.h, z0.h[4]\n" + "fmla z15.h, z6.h, z1.h[4]\n" + "fmla z19.h, z6.h, z2.h[4]\n" + "fmla z23.h, z6.h, z3.h[4]\n" + "fmla z27.h, z6.h, z4.h[4]\n" + "fmla z31.h, z6.h, z5.h[4]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[5]\n" - "fmla z12.h, z6.h, z1.h[5]\n" - "fmla z16.h, z6.h, z2.h[5]\n" - "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[5]\n" + "fmla z12.h, z7.h, z1.h[5]\n" + "fmla z16.h, z7.h, z2.h[5]\n" + "fmla z20.h, z7.h, z3.h[5]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[5]\n" - "fmla z28.h, z6.h, z5.h[5]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[5]\n" - "fmla z13.h, z7.h, z1.h[5]\n" - "fmla z17.h, z7.h, z2.h[5]\n" - "fmla z21.h, z7.h, z3.h[5]\n" - "fmla z25.h, z7.h, z4.h[5]\n" - "fmla z29.h, z7.h, z5.h[5]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[5]\n" + "fmla z28.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[5]\n" + "fmla z13.h, z6.h, z1.h[5]\n" + "fmla z17.h, z6.h, z2.h[5]\n" + "fmla z21.h, z6.h, z3.h[5]\n" + "fmla z25.h, z6.h, z4.h[5]\n" + "fmla z29.h, z6.h, z5.h[5]\n" 
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[5]\n" - "fmla z14.h, z6.h, z1.h[5]\n" - "fmla z18.h, z6.h, z2.h[5]\n" - "fmla z22.h, z6.h, z3.h[5]\n" - "fmla z26.h, z6.h, z4.h[5]\n" - "fmla z30.h, z6.h, z5.h[5]\n" - "fmla z11.h, z7.h, z0.h[5]\n" - "fmla z15.h, z7.h, z1.h[5]\n" - "fmla z19.h, z7.h, z2.h[5]\n" - "fmla z23.h, z7.h, z3.h[5]\n" - "fmla z27.h, z7.h, z4.h[5]\n" - "fmla z31.h, z7.h, z5.h[5]\n" + "fmla z10.h, z7.h, z0.h[5]\n" + "fmla z14.h, z7.h, z1.h[5]\n" + "fmla z18.h, z7.h, z2.h[5]\n" + "fmla z22.h, z7.h, z3.h[5]\n" + "fmla z26.h, z7.h, z4.h[5]\n" + "fmla z30.h, z7.h, z5.h[5]\n" + "fmla z11.h, z6.h, z0.h[5]\n" + "fmla z15.h, z6.h, z1.h[5]\n" + "fmla z19.h, z6.h, z2.h[5]\n" + "fmla z23.h, z6.h, z3.h[5]\n" + "fmla z27.h, z6.h, z4.h[5]\n" + "fmla z31.h, z6.h, z5.h[5]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[6]\n" - "fmla z12.h, z6.h, z1.h[6]\n" - "fmla z16.h, z6.h, z2.h[6]\n" - "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[6]\n" + "fmla z12.h, z7.h, z1.h[6]\n" + "fmla z16.h, z7.h, z2.h[6]\n" + "fmla z20.h, z7.h, z3.h[6]\n" "subs x27, x27, #0x1\n" - "fmla z24.h, z6.h, z4.h[6]\n" - "fmla z28.h, z6.h, z5.h[6]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[6]\n" - "fmla z13.h, z7.h, z1.h[6]\n" - "fmla z17.h, z7.h, z2.h[6]\n" - "fmla z21.h, z7.h, z3.h[6]\n" - "fmla z25.h, z7.h, z4.h[6]\n" - "fmla z29.h, z7.h, z5.h[6]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.h, z7.h, z4.h[6]\n" + "fmla z28.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[6]\n" + "fmla z13.h, z6.h, z1.h[6]\n" + "fmla z17.h, z6.h, z2.h[6]\n" + "fmla z21.h, z6.h, z3.h[6]\n" + "fmla z25.h, z6.h, z4.h[6]\n" + "fmla z29.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[6]\n" - "fmla z14.h, z6.h, z1.h[6]\n" - "fmla z18.h, z6.h, z2.h[6]\n" - "fmla z22.h, z6.h, z3.h[6]\n" - "fmla z26.h, z6.h, z4.h[6]\n" - "fmla z30.h, z6.h, z5.h[6]\n" - "fmla z11.h, z7.h, z0.h[6]\n" - "fmla z15.h, z7.h, z1.h[6]\n" - "fmla z19.h, z7.h, z2.h[6]\n" - "fmla z23.h, z7.h, z3.h[6]\n" - "fmla z27.h, z7.h, z4.h[6]\n" - "fmla z31.h, z7.h, z5.h[6]\n" + "fmla z10.h, z7.h, z0.h[6]\n" + "fmla z14.h, z7.h, z1.h[6]\n" + "fmla z18.h, z7.h, z2.h[6]\n" + "fmla z22.h, z7.h, z3.h[6]\n" + "fmla z26.h, z7.h, z4.h[6]\n" + "fmla z30.h, z7.h, z5.h[6]\n" + "fmla z11.h, z6.h, z0.h[6]\n" + "fmla z15.h, z6.h, z1.h[6]\n" + "fmla z19.h, z6.h, z2.h[6]\n" + "fmla z23.h, z6.h, z3.h[6]\n" + "fmla z27.h, z6.h, z4.h[6]\n" + "fmla z31.h, z6.h, z5.h[6]\n" "ble 76f\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.h, z6.h, z0.h[7]\n" - "fmla z12.h, z6.h, z1.h[7]\n" - "fmla z16.h, z6.h, z2.h[7]\n" - "fmla z20.h, z6.h, z3.h[7]\n" - "fmla z24.h, z6.h, z4.h[7]\n" - "fmla z28.h, z6.h, z5.h[7]\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.h, z7.h, z0.h[7]\n" - "fmla z13.h, z7.h, z1.h[7]\n" - "fmla z17.h, z7.h, z2.h[7]\n" - "fmla z21.h, z7.h, z3.h[7]\n" - "fmla z25.h, z7.h, z4.h[7]\n" - "fmla z29.h, z7.h, z5.h[7]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x10]\n" + "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.h, z7.h, z0.h[7]\n" + "fmla z12.h, z7.h, z1.h[7]\n" + "fmla z16.h, z7.h, z2.h[7]\n" + "fmla z20.h, z7.h, z3.h[7]\n" + "fmla z24.h, z7.h, 
z4.h[7]\n" + "fmla z28.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.h, z6.h, z0.h[7]\n" + "fmla z13.h, z6.h, z1.h[7]\n" + "fmla z17.h, z6.h, z2.h[7]\n" + "fmla z21.h, z6.h, z3.h[7]\n" + "fmla z25.h, z6.h, z4.h[7]\n" + "fmla z29.h, z6.h, z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.h, z6.h, z0.h[7]\n" - "fmla z14.h, z6.h, z1.h[7]\n" - "fmla z18.h, z6.h, z2.h[7]\n" - "fmla z22.h, z6.h, z3.h[7]\n" - "fmla z26.h, z6.h, z4.h[7]\n" - "fmla z30.h, z6.h, z5.h[7]\n" - "fmla z11.h, z7.h, z0.h[7]\n" - "fmla z15.h, z7.h, z1.h[7]\n" - "fmla z19.h, z7.h, z2.h[7]\n" - "fmla z23.h, z7.h, z3.h[7]\n" - "fmla z27.h, z7.h, z4.h[7]\n" - "fmla z31.h, z7.h, z5.h[7]\n" + "fmla z10.h, z7.h, z0.h[7]\n" + "fmla z14.h, z7.h, z1.h[7]\n" + "fmla z18.h, z7.h, z2.h[7]\n" + "fmla z22.h, z7.h, z3.h[7]\n" + "fmla z26.h, z7.h, z4.h[7]\n" + "fmla z30.h, z7.h, z5.h[7]\n" + "fmla z11.h, z6.h, z0.h[7]\n" + "fmla z15.h, z6.h, z1.h[7]\n" + "fmla z19.h, z6.h, z2.h[7]\n" + "fmla z23.h, z6.h, z3.h[7]\n" + "fmla z27.h, z6.h, z4.h[7]\n" + "fmla z31.h, z6.h, z5.h[7]\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -3023,7 +3023,6 @@ void sve_hybrid_fp16_mla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -3031,4 +3030,4 @@ void sve_hybrid_fp16_mla_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp index b63b143d4c..880f9d1a27 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -75,13 +75,16 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, float>::value) { switch (ci->get_cpu_model()) { - case CPUModel::V1: - return { 15.65 }; default: return { 6.667 }; + case CPUModel::A510: + return { 5.41 }; + case CPUModel::V1: + return { 15.65 }; + case CPUModel::A64FX: + return { 25.55 }; } } @@ -105,5 +108,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp index 9ae51af59b..66481f04f9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp @@ -139,11 +139,11 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -159,12 +159,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "9:" // Height 1: Multiply loop: Main loop "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x26, x26, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1w { z6.s }, p4/Z, [x10]\n" @@ -174,27 +174,27 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" "addvl x10, x10, #4\n" "bne 6b\n" "tbz %x[flags], #1, 11f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s
}, p4/Z, [x20]\n" + "ld1rw { z17.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z17.s\n" + "fmin z9.s, p4/M, z9.s, z17.s\n" + "fmin z10.s, p4/M, z10.s, z17.s\n" + "fmin z11.s, p4/M, z11.s, z17.s\n" + "fmax z8.s, p4/M, z8.s, z16.s\n" + "fmax z9.s, p4/M, z9.s, z16.s\n" + "fmax z10.s, p4/M, z10.s, z16.s\n" + "fmax z11.s, p4/M, z11.s, z16.s\n" "11:" // Height 1: No activation "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" @@ -234,15 +234,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "15:" // Height 2: no bias "tbz %x[flags], #0, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x20]\n" + "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n" "b 17f\n" "16:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -258,12 +258,12 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -271,7 +271,7 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "b 20f\n" "19:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "20:" // Height 2: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -282,18 +282,18 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "21:" // Height 2: Multiply loop: Main loop "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z12.s, p4/M, z6.s, z1.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n" "add x26, x26, #0x4\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "subs x27, x27, #0x1\n" "add x25, x25, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z14.s, p4/M, z17.s, z1.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" + "fmla z15.s, p4/M, z16.s, z1.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1w { z6.s }, p4/Z, [x10]\n" @@ -303,41 +303,41 @@ void 
sve_hybrid_fp32_mla_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "fmla z8.s, p4/M, z6.s, z0.s\n" "fmla z12.s, p4/M, z6.s, z1.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x10, #2, MUL VL]\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" + "fmla z10.s, p4/M, z17.s, z0.s\n" + "fmla z14.s, p4/M, z17.s, z1.s\n" "addvl x10, x10, #4\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z11.s, p4/M, z16.s, z0.s\n" + "fmla z15.s, p4/M, z16.s, z1.s\n" "bne 18b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #2\n" "tbz %x[flags], #1, 23f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z17.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" + "ld1rw { z16.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z17.s\n" + "fmin z9.s, p4/M, z9.s, z17.s\n" + "fmin z10.s, p4/M, z10.s, z17.s\n" + "fmin z11.s, p4/M, z11.s, z17.s\n" + "fmin z12.s, p4/M, z12.s, z17.s\n" + "fmin z13.s, p4/M, z13.s, z17.s\n" + "fmin z14.s, p4/M, z14.s, z17.s\n" + "fmin z15.s, p4/M, z15.s, z17.s\n" + "fmax z8.s, p4/M, z8.s, z16.s\n" + "fmax z9.s, p4/M, z9.s, z16.s\n" + "fmax z10.s, p4/M, z10.s, z16.s\n" + "fmax z11.s, p4/M, z11.s, z16.s\n" + "fmax z12.s, p4/M, z12.s, z16.s\n" + "fmax z13.s, p4/M, z13.s, z16.s\n" + "fmax z14.s, p4/M, z14.s, z16.s\n" + "fmax z15.s, p4/M, z15.s, z16.s\n" "23:" // Height 2: No activation "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" @@ -385,20 +385,20 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "27:" // Height 3: no bias "tbz %x[flags], #0, 28f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x21]\n" + "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x20]\n" + "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n" "b 29f\n" "28:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ 
-418,13 +418,13 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -433,8 +433,8 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "b 32f\n" "31:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "32:" // Height 3: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -450,21 +450,21 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x24, x24, #0x4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z21.s, z0.s\n" + "fmla z14.s, p4/M, z21.s, z1.s\n" + "fmla z18.s, p4/M, z21.s, z2.s\n" + "fmla z11.s, p4/M, z20.s, z0.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1w { z6.s }, p4/Z, [x10]\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z15.s, p4/M, z20.s, z1.s\n" + "fmla z19.s, p4/M, z20.s, z2.s\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n" @@ -476,51 +476,51 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z21.s }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z10.s, p4/M, z21.s, z0.s\n" + "fmla z14.s, p4/M, z21.s, z1.s\n" + "fmla z18.s, p4/M, z21.s, z2.s\n" + "fmla z11.s, p4/M, z20.s, z0.s\n" + "fmla z15.s, p4/M, z20.s, z1.s\n" + "fmla z19.s, p4/M, z20.s, z2.s\n" "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #2\n" "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 35f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z21.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, 
p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" + "ld1rw { z20.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z21.s\n" + "fmin z9.s, p4/M, z9.s, z21.s\n" + "fmin z10.s, p4/M, z10.s, z21.s\n" + "fmin z11.s, p4/M, z11.s, z21.s\n" + "fmin z12.s, p4/M, z12.s, z21.s\n" + "fmin z13.s, p4/M, z13.s, z21.s\n" + "fmin z14.s, p4/M, z14.s, z21.s\n" + "fmin z15.s, p4/M, z15.s, z21.s\n" + "fmin z16.s, p4/M, z16.s, z21.s\n" + "fmin z17.s, p4/M, z17.s, z21.s\n" + "fmin z18.s, p4/M, z18.s, z21.s\n" + "fmin z19.s, p4/M, z19.s, z21.s\n" + "fmax z8.s, p4/M, z8.s, z20.s\n" + "fmax z9.s, p4/M, z9.s, z20.s\n" + "fmax z10.s, p4/M, z10.s, z20.s\n" + "fmax z11.s, p4/M, z11.s, z20.s\n" + "fmax z12.s, p4/M, z12.s, z20.s\n" + "fmax z13.s, p4/M, z13.s, z20.s\n" + "fmax z14.s, p4/M, z14.s, z20.s\n" + "fmax z15.s, p4/M, z15.s, z20.s\n" + "fmax z16.s, p4/M, z16.s, z20.s\n" + "fmax z17.s, p4/M, z17.s, z20.s\n" + "fmax z18.s, p4/M, z18.s, z20.s\n" + "fmax z19.s, p4/M, z19.s, z20.s\n" "35:" // Height 3: No activation "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" @@ -576,25 +576,25 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "39:" // Height 4: no bias "tbz %x[flags], #0, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x22]\n" + "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x21]\n" + "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x20]\n" + "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n" "b 41f\n" "40:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -618,14 +618,14 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "42:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, 
[%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 43f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 44f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -635,9 +635,9 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "b 44f\n" "43:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "44:" // Height 4: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -654,7 +654,7 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "subs x27, x27, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z20.s, p4/M, z6.s, z3.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" @@ -662,19 +662,19 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x23, x23, #0x4\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "fmla z21.s, p4/M, z7.s, z3.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" + "fmla z10.s, p4/M, z25.s, z0.s\n" + "fmla z14.s, p4/M, z25.s, z1.s\n" + "fmla z18.s, p4/M, z25.s, z2.s\n" + "fmla z22.s, p4/M, z25.s, z3.s\n" "ld1w { z6.s }, p4/Z, [x10]\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" + "fmla z11.s, p4/M, z24.s, z0.s\n" + "fmla z15.s, p4/M, z24.s, z1.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z19.s, p4/M, z24.s, z2.s\n" + "fmla z23.s, p4/M, z24.s, z3.s\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n" @@ -686,22 +686,22 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x28, x28, #0x1\n" "fmla z16.s, p4/M, z6.s, z2.s\n" "fmla z20.s, p4/M, z6.s, z3.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "fmla z9.s, p4/M, z7.s, z0.s\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "fmla z21.s, p4/M, z7.s, z3.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" + "fmla z10.s, p4/M, z25.s, z0.s\n" + "fmla z14.s, p4/M, z25.s, z1.s\n" + "fmla z18.s, p4/M, z25.s, z2.s\n" + "fmla z22.s, p4/M, z25.s, z3.s\n" + "fmla z11.s, p4/M, z24.s, z0.s\n" + "fmla z15.s, p4/M, z24.s, z1.s\n" + "fmla z19.s, p4/M, z24.s, z2.s\n" + "fmla z23.s, p4/M, z24.s, z3.s\n" "bne 42b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #2\n" @@ -709,41 +709,41 @@ void 
sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 47f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z25.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmin z20.s, p4/M, z20.s, z1.s\n" - "fmin z21.s, p4/M, z21.s, z1.s\n" - "fmin z22.s, p4/M, z22.s, z1.s\n" - "fmin z23.s, p4/M, z23.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" - "fmax z20.s, p4/M, z20.s, z0.s\n" - "fmax z21.s, p4/M, z21.s, z0.s\n" - "fmax z22.s, p4/M, z22.s, z0.s\n" - "fmax z23.s, p4/M, z23.s, z0.s\n" + "ld1rw { z24.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z25.s\n" + "fmin z9.s, p4/M, z9.s, z25.s\n" + "fmin z10.s, p4/M, z10.s, z25.s\n" + "fmin z11.s, p4/M, z11.s, z25.s\n" + "fmin z12.s, p4/M, z12.s, z25.s\n" + "fmin z13.s, p4/M, z13.s, z25.s\n" + "fmin z14.s, p4/M, z14.s, z25.s\n" + "fmin z15.s, p4/M, z15.s, z25.s\n" + "fmin z16.s, p4/M, z16.s, z25.s\n" + "fmin z17.s, p4/M, z17.s, z25.s\n" + "fmin z18.s, p4/M, z18.s, z25.s\n" + "fmin z19.s, p4/M, z19.s, z25.s\n" + "fmin z20.s, p4/M, z20.s, z25.s\n" + "fmin z21.s, p4/M, z21.s, z25.s\n" + "fmin z22.s, p4/M, z22.s, z25.s\n" + "fmin z23.s, p4/M, z23.s, z25.s\n" + "fmax z8.s, p4/M, z8.s, z24.s\n" + "fmax z9.s, p4/M, z9.s, z24.s\n" + "fmax z10.s, p4/M, z10.s, z24.s\n" + "fmax z11.s, p4/M, z11.s, z24.s\n" + "fmax z12.s, p4/M, z12.s, z24.s\n" + "fmax z13.s, p4/M, z13.s, z24.s\n" + "fmax z14.s, p4/M, z14.s, z24.s\n" + "fmax z15.s, p4/M, z15.s, z24.s\n" + "fmax z16.s, p4/M, z16.s, z24.s\n" + "fmax z17.s, p4/M, z17.s, z24.s\n" + "fmax z18.s, p4/M, z18.s, z24.s\n" + "fmax z19.s, p4/M, z19.s, z24.s\n" + "fmax z20.s, p4/M, z20.s, z24.s\n" + "fmax z21.s, p4/M, z21.s, z24.s\n" + "fmax z22.s, p4/M, z22.s, z24.s\n" + "fmax z23.s, p4/M, z23.s, z24.s\n" "47:" // Height 4: No activation "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" @@ -807,30 +807,30 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "51:" // Height 5: no bias "tbz %x[flags], #0, 52f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x9]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, 
MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" "b 53f\n" "52:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -858,15 +858,15 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "54:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 55f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -877,10 +877,10 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "b 56f\n" "55:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "56:" // Height 5: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -902,29 +902,29 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x24, x24, #0x4\n" "fmla z24.s, p4/M, z6.s, z4.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n" "add x23, x23, #0x4\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "add x22, x22, #0x4\n" "fmla z21.s, p4/M, z7.s, z3.s\n" "fmla z25.s, p4/M, z7.s, z4.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z26.s, p4/M, z6.s, z4.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" + "fmla z10.s, p4/M, z29.s, z0.s\n" + "fmla z14.s, p4/M, z29.s, z1.s\n" + "fmla z18.s, p4/M, z29.s, z2.s\n" + "fmla z22.s, p4/M, z29.s, z3.s\n" + "fmla z26.s, p4/M, z29.s, z4.s\n" + "fmla z11.s, p4/M, z28.s, z0.s\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1w { z6.s }, p4/Z, 
[x10]\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" + "fmla z15.s, p4/M, z28.s, z1.s\n" + "fmla z19.s, p4/M, z28.s, z2.s\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" - "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z23.s, p4/M, z28.s, z3.s\n" + "fmla z27.s, p4/M, z28.s, z4.s\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1rw { z4.s }, p4/Z, [x22]\n" "ld1w { z7.s }, p4/Z, [x10, #1, MUL VL]\n" @@ -939,23 +939,23 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "cmp x28, x20\n" "fmla z24.s, p4/M, z6.s, z4.s\n" "fmla z9.s, p4/M, z7.s, z0.s\n" - "ld1w { z6.s }, p4/Z, [x10, #2, MUL VL]\n" + "ld1w { z29.s }, p4/Z, [x10, #2, MUL VL]\n" "fmla z13.s, p4/M, z7.s, z1.s\n" "fmla z17.s, p4/M, z7.s, z2.s\n" "fmla z21.s, p4/M, z7.s, z3.s\n" "fmla z25.s, p4/M, z7.s, z4.s\n" - "ld1w { z7.s }, p4/Z, [x10, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, p4/M, z6.s, z0.s\n" - "fmla z14.s, p4/M, z6.s, z1.s\n" - "fmla z18.s, p4/M, z6.s, z2.s\n" - "fmla z22.s, p4/M, z6.s, z3.s\n" - "fmla z26.s, p4/M, z6.s, z4.s\n" - "fmla z11.s, p4/M, z7.s, z0.s\n" - "fmla z15.s, p4/M, z7.s, z1.s\n" - "fmla z19.s, p4/M, z7.s, z2.s\n" - "fmla z23.s, p4/M, z7.s, z3.s\n" - "fmla z27.s, p4/M, z7.s, z4.s\n" + "fmla z10.s, p4/M, z29.s, z0.s\n" + "fmla z14.s, p4/M, z29.s, z1.s\n" + "fmla z18.s, p4/M, z29.s, z2.s\n" + "fmla z22.s, p4/M, z29.s, z3.s\n" + "fmla z26.s, p4/M, z29.s, z4.s\n" + "fmla z11.s, p4/M, z28.s, z0.s\n" + "fmla z15.s, p4/M, z28.s, z1.s\n" + "fmla z19.s, p4/M, z28.s, z2.s\n" + "fmla z23.s, p4/M, z28.s, z3.s\n" + "fmla z27.s, p4/M, z28.s, z4.s\n" "bne 54b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x25, x9, x20, LSL #2\n" @@ -964,49 +964,49 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 59f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p4/Z, [x20]\n" + "ld1rw { z29.s }, p4/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p4/Z, [x20]\n" - "fmin z8.s, p4/M, z8.s, z1.s\n" - "fmin z9.s, p4/M, z9.s, z1.s\n" - "fmin z10.s, p4/M, z10.s, z1.s\n" - "fmin z11.s, p4/M, z11.s, z1.s\n" - "fmin z12.s, p4/M, z12.s, z1.s\n" - "fmin z13.s, p4/M, z13.s, z1.s\n" - "fmin z14.s, p4/M, z14.s, z1.s\n" - "fmin z15.s, p4/M, z15.s, z1.s\n" - "fmin z16.s, p4/M, z16.s, z1.s\n" - "fmin z17.s, p4/M, z17.s, z1.s\n" - "fmin z18.s, p4/M, z18.s, z1.s\n" - "fmin z19.s, p4/M, z19.s, z1.s\n" - "fmin z20.s, p4/M, z20.s, z1.s\n" - "fmin z21.s, p4/M, z21.s, z1.s\n" - "fmin z22.s, p4/M, z22.s, z1.s\n" - "fmin z23.s, p4/M, z23.s, z1.s\n" - "fmin z24.s, p4/M, z24.s, z1.s\n" - "fmin z25.s, p4/M, z25.s, z1.s\n" - "fmin z26.s, p4/M, z26.s, z1.s\n" - "fmin z27.s, p4/M, z27.s, z1.s\n" - "fmax z8.s, p4/M, z8.s, z0.s\n" - "fmax z9.s, p4/M, z9.s, z0.s\n" - "fmax z10.s, p4/M, z10.s, z0.s\n" - "fmax z11.s, p4/M, z11.s, z0.s\n" - "fmax z12.s, p4/M, z12.s, z0.s\n" - "fmax z13.s, p4/M, z13.s, z0.s\n" - "fmax z14.s, p4/M, z14.s, z0.s\n" - "fmax z15.s, p4/M, z15.s, z0.s\n" - "fmax z16.s, p4/M, z16.s, z0.s\n" - "fmax z17.s, p4/M, z17.s, z0.s\n" - "fmax z18.s, p4/M, z18.s, z0.s\n" - "fmax z19.s, p4/M, z19.s, z0.s\n" - "fmax z20.s, p4/M, z20.s, z0.s\n" - "fmax z21.s, p4/M, z21.s, z0.s\n" - "fmax z22.s, p4/M, z22.s, z0.s\n" - "fmax z23.s, p4/M, z23.s, z0.s\n" - "fmax z24.s, p4/M, z24.s, z0.s\n" - "fmax z25.s, p4/M, z25.s, z0.s\n" - "fmax z26.s, p4/M, z26.s, z0.s\n" - "fmax z27.s, p4/M, z27.s, z0.s\n" + "ld1rw { z28.s }, p4/Z, [x20]\n" + "fmin z8.s, p4/M, z8.s, z29.s\n" 
+ "fmin z9.s, p4/M, z9.s, z29.s\n" + "fmin z10.s, p4/M, z10.s, z29.s\n" + "fmin z11.s, p4/M, z11.s, z29.s\n" + "fmin z12.s, p4/M, z12.s, z29.s\n" + "fmin z13.s, p4/M, z13.s, z29.s\n" + "fmin z14.s, p4/M, z14.s, z29.s\n" + "fmin z15.s, p4/M, z15.s, z29.s\n" + "fmin z16.s, p4/M, z16.s, z29.s\n" + "fmin z17.s, p4/M, z17.s, z29.s\n" + "fmin z18.s, p4/M, z18.s, z29.s\n" + "fmin z19.s, p4/M, z19.s, z29.s\n" + "fmin z20.s, p4/M, z20.s, z29.s\n" + "fmin z21.s, p4/M, z21.s, z29.s\n" + "fmin z22.s, p4/M, z22.s, z29.s\n" + "fmin z23.s, p4/M, z23.s, z29.s\n" + "fmin z24.s, p4/M, z24.s, z29.s\n" + "fmin z25.s, p4/M, z25.s, z29.s\n" + "fmin z26.s, p4/M, z26.s, z29.s\n" + "fmin z27.s, p4/M, z27.s, z29.s\n" + "fmax z8.s, p4/M, z8.s, z28.s\n" + "fmax z9.s, p4/M, z9.s, z28.s\n" + "fmax z10.s, p4/M, z10.s, z28.s\n" + "fmax z11.s, p4/M, z11.s, z28.s\n" + "fmax z12.s, p4/M, z12.s, z28.s\n" + "fmax z13.s, p4/M, z13.s, z28.s\n" + "fmax z14.s, p4/M, z14.s, z28.s\n" + "fmax z15.s, p4/M, z15.s, z28.s\n" + "fmax z16.s, p4/M, z16.s, z28.s\n" + "fmax z17.s, p4/M, z17.s, z28.s\n" + "fmax z18.s, p4/M, z18.s, z28.s\n" + "fmax z19.s, p4/M, z19.s, z28.s\n" + "fmax z20.s, p4/M, z20.s, z28.s\n" + "fmax z21.s, p4/M, z21.s, z28.s\n" + "fmax z22.s, p4/M, z22.s, z28.s\n" + "fmax z23.s, p4/M, z23.s, z28.s\n" + "fmax z24.s, p4/M, z24.s, z28.s\n" + "fmax z25.s, p4/M, z25.s, z28.s\n" + "fmax z26.s, p4/M, z26.s, z28.s\n" + "fmax z27.s, p4/M, z27.s, z28.s\n" "59:" // Height 5: No activation "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" @@ -1081,35 +1081,35 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "63:" // Height 6: no bias "tbz %x[flags], #0, 64f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x9]\n" + "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" - "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x25]\n" - "ld1w { z13.s }, p2/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x24]\n" - "ld1w { z17.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x23]\n" - "ld1w { z21.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x22]\n" - "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p3/Z, [x21]\n" - "ld1w { z29.s }, p2/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p1/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x24]\n" + "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x23]\n" + "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x22]\n" + "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, 
[x21]\n" + "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p3/Z, [x20]\n" + "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n" "b 65f\n" "64:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1141,16 +1141,16 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "66:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 67f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1162,11 +1162,11 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "b 68f\n" "67:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "68:" // Height 6: input setup done "subs x27, x27, #0x1\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -1355,7 +1355,6 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "74:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1363,4 +1362,4 @@ void sve_hybrid_fp32_mla_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp index 71c6afba42..e1581f2026 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp @@ -139,11 
+139,11 @@ void sve_hybrid_fp32_mla_6x4VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -156,87 +156,87 @@ void sve_hybrid_fp32_mla_6x4VL ( "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10]\n" + "fmla z8.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z10.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z11.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z8.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z9.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z10.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z11.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[2]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z17.s, z0.s[2]\n" + "fmla z11.s, z16.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[3]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" "add x26, x26, #0x10\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "ld1w { z7.s }, 
p5/Z, [x10, #1, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10]\n" + "fmla z8.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z17.s, z0.s[0]\n" + "fmla z11.s, z16.s, z0.s[0]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[1]\n" + "fmla z9.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z10.s, z17.s, z0.s[1]\n" + "fmla z11.s, z16.s, z0.s[1]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[2]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z10.s, z17.s, z0.s[2]\n" + "fmla z11.s, z16.s, z0.s[2]\n" "addvl x10, x10, #4\n" "ble 11f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[3]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" "addvl x10, x10, #4\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -245,17 +245,17 @@ void sve_hybrid_fp32_mla_6x4VL ( "bne 6b\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { 
z9.s }, p3, [x9, #1, MUL VL]\n" @@ -295,15 +295,15 @@ void sve_hybrid_fp32_mla_6x4VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" "b 18f\n" "17:" // Height 2: no accumulate "mov z8.b, #0x0\n" @@ -319,12 +319,12 @@ void sve_hybrid_fp32_mla_6x4VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -332,130 +332,130 @@ void sve_hybrid_fp32_mla_6x4VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "21:" // Height 2: input setup done "cmp x27, #0x4\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[0]\n" + "fmla z12.s, z17.s, z0.s[0]\n" + "fmla z9.s, z16.s, z1.s[0]\n" + "fmla z13.s, z16.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z17.s, z1.s[0]\n" + "fmla z14.s, z17.s, z0.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10, #4, MUL VL]\n" "cmp x27, #0x4\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[0]\n" + "fmla z15.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p5/Z, [x10, #5, MUL VL]\n" "add x26, x26, #0x10\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[1]\n" + "fmla z12.s, z17.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #6, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[1]\n" + "fmla z13.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #7, MUL VL]\n" 
"addvl x10, x10, #16\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z10.s, z17.s, z1.s[1]\n" + "fmla z14.s, z17.s, z0.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[1]\n" + "fmla z15.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[2]\n" + "fmla z12.s, z17.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[2]\n" + "fmla z13.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z17.s, z1.s[2]\n" + "fmla z14.s, z17.s, z0.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.s, z16.s, z1.s[2]\n" + "fmla z15.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z17.s, z1.s[3]\n" + "fmla z12.s, z17.s, z0.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.s, z16.s, z1.s[3]\n" + "fmla z13.s, z16.s, z0.s[3]\n" + "ld1w { z16.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.s, z17.s, z1.s[3]\n" + "fmla z14.s, z17.s, z0.s[3]\n" + "fmla z11.s, z16.s, z1.s[3]\n" + "fmla z15.s, z16.s, z0.s[3]\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" "ld1rqw { z0.s }, p0/Z, [x26]\n" "ld1rqw { z1.s }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[0]\n" + "fmla z12.s, z17.s, z1.s[0]\n" + "fmla z9.s, z16.s, z0.s[0]\n" + "fmla z13.s, z16.s, z1.s[0]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z17.s, z0.s[0]\n" + "fmla z14.s, z17.s, z1.s[0]\n" "addvl x10, x10, #4\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z11.s, z16.s, z0.s[0]\n" + "fmla z15.s, z16.s, z1.s[0]\n" "ble 24f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[1]\n" + "fmla 
z12.s, z17.s, z1.s[1]\n" + "fmla z9.s, z16.s, z0.s[1]\n" + "fmla z13.s, z16.s, z1.s[1]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z10.s, z17.s, z0.s[1]\n" + "fmla z14.s, z17.s, z1.s[1]\n" "addvl x10, x10, #4\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z11.s, z16.s, z0.s[1]\n" + "fmla z15.s, z16.s, z1.s[1]\n" "ble 24f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[2]\n" + "fmla z12.s, z17.s, z1.s[2]\n" + "fmla z9.s, z16.s, z0.s[2]\n" + "fmla z13.s, z16.s, z1.s[2]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z10.s, z17.s, z0.s[2]\n" + "fmla z14.s, z17.s, z1.s[2]\n" "addvl x10, x10, #4\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z11.s, z16.s, z0.s[2]\n" + "fmla z15.s, z16.s, z1.s[2]\n" "ble 24f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10]\n" + "ld1w { z16.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z17.s, z0.s[3]\n" + "fmla z12.s, z17.s, z1.s[3]\n" + "fmla z9.s, z16.s, z0.s[3]\n" + "fmla z13.s, z16.s, z1.s[3]\n" + "ld1w { z17.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z17.s, z0.s[3]\n" + "fmla z14.s, z17.s, z1.s[3]\n" "addvl x10, x10, #4\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z11.s, z16.s, z0.s[3]\n" + "fmla z15.s, z16.s, z1.s[3]\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -465,25 +465,25 @@ void sve_hybrid_fp32_mla_6x4VL ( "add x25, x9, x20, LSL #2\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, 
p5/M, z14.s, z17.s\n" + "fmin z15.s, p5/M, z15.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z15.s, p5/M, z15.s, z16.s\n" "25:" // Height 2: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -531,20 +531,20 @@ void sve_hybrid_fp32_mla_6x4VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20]\n" + "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n" "b 31f\n" "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -564,13 +564,13 @@ void sve_hybrid_fp32_mla_6x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -579,86 +579,86 @@ void sve_hybrid_fp32_mla_6x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "34:" // Height 3: input setup done "cmp x27, #0x4\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" "ld1rqw { z1.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1rqw { z0.s }, p0/Z, [x24]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "fmla z8.s, z21.s, z2.s[0]\n" + "fmla z12.s, z21.s, z1.s[0]\n" + "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.s, z21.s, z0.s[0]\n" + "fmla z9.s, z20.s, 
z2.s[0]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[0]\n" + "fmla z17.s, z20.s, z0.s[0]\n" + "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n" "cmp x27, #0x4\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z10.s, z21.s, z2.s[0]\n" + "fmla z14.s, z21.s, z1.s[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z18.s, z21.s, z0.s[0]\n" + "fmla z11.s, z20.s, z2.s[0]\n" + "ld1w { z21.s }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[0]\n" + "fmla z19.s, z20.s, z0.s[0]\n" + "ld1w { z20.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[1]\n" + "fmla z12.s, z21.s, z1.s[1]\n" + "fmla z16.s, z21.s, z0.s[1]\n" + "fmla z9.s, z20.s, z2.s[1]\n" + "ld1w { z21.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[1]\n" + "fmla z17.s, z20.s, z0.s[1]\n" + "ld1w { z20.s }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z10.s, z21.s, z2.s[1]\n" + "fmla z14.s, z21.s, z1.s[1]\n" + "fmla z18.s, z21.s, z0.s[1]\n" + "fmla z11.s, z20.s, z2.s[1]\n" + "ld1w { z21.s }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[1]\n" + "fmla z19.s, z20.s, z0.s[1]\n" + "ld1w { z20.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[2]\n" + "fmla z12.s, z21.s, z1.s[2]\n" + "fmla z16.s, z21.s, z0.s[2]\n" + "fmla z9.s, z20.s, z2.s[2]\n" + "ld1w { z21.s }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[2]\n" + "fmla z17.s, z20.s, z0.s[2]\n" + "ld1w { z20.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z21.s, z2.s[2]\n" + "fmla z14.s, z21.s, z1.s[2]\n" + "fmla z18.s, z21.s, z0.s[2]\n" + "fmla z11.s, z20.s, z2.s[2]\n" + "ld1w { z21.s }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.s, z20.s, z1.s[2]\n" + "fmla 
z19.s, z20.s, z0.s[2]\n" + "ld1w { z20.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z21.s, z2.s[3]\n" + "fmla z12.s, z21.s, z1.s[3]\n" + "fmla z16.s, z21.s, z0.s[3]\n" + "fmla z9.s, z20.s, z2.s[3]\n" + "ld1w { z21.s }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[3]\n" + "fmla z17.s, z20.s, z0.s[3]\n" + "ld1w { z20.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.s, z21.s, z2.s[3]\n" + "fmla z14.s, z21.s, z1.s[3]\n" + "fmla z18.s, z21.s, z0.s[3]\n" + "fmla z11.s, z20.s, z2.s[3]\n" + "fmla z15.s, z20.s, z1.s[3]\n" + "fmla z19.s, z20.s, z0.s[3]\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -666,79 +666,79 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z1.s }, p0/Z, [x25]\n" "subs x27, x27, #0x1\n" "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "fmla z8.s, z21.s, z0.s[0]\n" + "fmla z12.s, z21.s, z1.s[0]\n" + "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.s, z21.s, z2.s[0]\n" + "fmla z9.s, z20.s, z0.s[0]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[0]\n" + "fmla z17.s, z20.s, z2.s[0]\n" + "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z10.s, z21.s, z0.s[0]\n" + "fmla z14.s, z21.s, z1.s[0]\n" + "fmla z18.s, z21.s, z2.s[0]\n" + "fmla z11.s, z20.s, z0.s[0]\n" + "fmla z15.s, z20.s, z1.s[0]\n" + "fmla z19.s, z20.s, z2.s[0]\n" "ble 37f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z21.s, z0.s[1]\n" + "fmla z12.s, z21.s, z1.s[1]\n" + "fmla z16.s, z21.s, z2.s[1]\n" + "fmla z9.s, z20.s, z0.s[1]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[1]\n" + "fmla z17.s, z20.s, z2.s[1]\n" + "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z10.s, z21.s, z0.s[1]\n" + "fmla z14.s, z21.s, z1.s[1]\n" + "fmla z18.s, z21.s, z2.s[1]\n" + "fmla z11.s, z20.s, z0.s[1]\n" + "fmla z15.s, z20.s, z1.s[1]\n" + "fmla z19.s, z20.s, z2.s[1]\n" "ble 37f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z21.s, z0.s[2]\n" + "fmla z12.s, z21.s, z1.s[2]\n" + "fmla z16.s, z21.s, z2.s[2]\n" + 
"fmla z9.s, z20.s, z0.s[2]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[2]\n" + "fmla z17.s, z20.s, z2.s[2]\n" + "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z10.s, z21.s, z0.s[2]\n" + "fmla z14.s, z21.s, z1.s[2]\n" + "fmla z18.s, z21.s, z2.s[2]\n" + "fmla z11.s, z20.s, z0.s[2]\n" + "fmla z15.s, z20.s, z1.s[2]\n" + "fmla z19.s, z20.s, z2.s[2]\n" "ble 37f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z21.s }, p5/Z, [x10]\n" + "ld1w { z20.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z21.s, z0.s[3]\n" + "fmla z12.s, z21.s, z1.s[3]\n" + "fmla z16.s, z21.s, z2.s[3]\n" + "fmla z9.s, z20.s, z0.s[3]\n" + "ld1w { z21.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z20.s, z1.s[3]\n" + "fmla z17.s, z20.s, z2.s[3]\n" + "ld1w { z20.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z10.s, z21.s, z0.s[3]\n" + "fmla z14.s, z21.s, z1.s[3]\n" + "fmla z18.s, z21.s, z2.s[3]\n" + "fmla z11.s, z20.s, z0.s[3]\n" + "fmla z15.s, z20.s, z1.s[3]\n" + "fmla z19.s, z20.s, z2.s[3]\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -749,33 +749,33 @@ void sve_hybrid_fp32_mla_6x4VL ( "add x24, x25, x20, LSL #2\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z21.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z20.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z21.s\n" + "fmin z9.s, p5/M, z9.s, z21.s\n" + "fmin z10.s, p5/M, z10.s, z21.s\n" + "fmin z11.s, p5/M, z11.s, z21.s\n" + "fmin z12.s, p5/M, z12.s, z21.s\n" + "fmin z13.s, p5/M, z13.s, z21.s\n" + "fmin z14.s, p5/M, z14.s, z21.s\n" + "fmin z15.s, p5/M, z15.s, z21.s\n" + "fmin z16.s, p5/M, z16.s, z21.s\n" + "fmin z17.s, p5/M, z17.s, 
z21.s\n" + "fmin z18.s, p5/M, z18.s, z21.s\n" + "fmin z19.s, p5/M, z19.s, z21.s\n" + "fmax z8.s, p5/M, z8.s, z20.s\n" + "fmax z9.s, p5/M, z9.s, z20.s\n" + "fmax z10.s, p5/M, z10.s, z20.s\n" + "fmax z11.s, p5/M, z11.s, z20.s\n" + "fmax z12.s, p5/M, z12.s, z20.s\n" + "fmax z13.s, p5/M, z13.s, z20.s\n" + "fmax z14.s, p5/M, z14.s, z20.s\n" + "fmax z15.s, p5/M, z15.s, z20.s\n" + "fmax z16.s, p5/M, z16.s, z20.s\n" + "fmax z17.s, p5/M, z17.s, z20.s\n" + "fmax z18.s, p5/M, z18.s, z20.s\n" + "fmax z19.s, p5/M, z19.s, z20.s\n" "38:" // Height 3: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -831,25 +831,25 @@ void sve_hybrid_fp32_mla_6x4VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21]\n" + "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "b 44f\n" "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -873,14 +873,14 @@ void sve_hybrid_fp32_mla_6x4VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -890,105 +890,105 @@ void sve_hybrid_fp32_mla_6x4VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "47:" // Height 4: input setup done "cmp x27, #0x4\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, 
[x25]\n" + "ld1rqw { z3.s }, p0/Z, [x26]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" + "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[0]\n" + "fmla z12.s, z25.s, z2.s[0]\n" + "fmla z16.s, z25.s, z1.s[0]\n" + "fmla z20.s, z25.s, z0.s[0]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z9.s, z24.s, z3.s[0]\n" + "fmla z13.s, z24.s, z2.s[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z17.s, z24.s, z1.s[0]\n" + "fmla z21.s, z24.s, z0.s[0]\n" + "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z25.s, z3.s[0]\n" + "fmla z14.s, z25.s, z2.s[0]\n" + "fmla z18.s, z25.s, z1.s[0]\n" + "fmla z22.s, z25.s, z0.s[0]\n" + "ld1w { z25.s }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[0]\n" + "fmla z15.s, z24.s, z2.s[0]\n" + "fmla z19.s, z24.s, z1.s[0]\n" + "fmla z23.s, z24.s, z0.s[0]\n" + "ld1w { z24.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[1]\n" + "fmla z12.s, z25.s, z2.s[1]\n" + "fmla z16.s, z25.s, z1.s[1]\n" + "fmla z20.s, z25.s, z0.s[1]\n" + "ld1w { z25.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[1]\n" + "fmla z13.s, z24.s, z2.s[1]\n" + "fmla z17.s, z24.s, z1.s[1]\n" + "fmla z21.s, z24.s, z0.s[1]\n" + "ld1w { z24.s }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, 
z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z10.s, z25.s, z3.s[1]\n" + "fmla z14.s, z25.s, z2.s[1]\n" + "fmla z18.s, z25.s, z1.s[1]\n" + "fmla z22.s, z25.s, z0.s[1]\n" + "ld1w { z25.s }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[1]\n" + "fmla z15.s, z24.s, z2.s[1]\n" + "fmla z19.s, z24.s, z1.s[1]\n" + "fmla z23.s, z24.s, z0.s[1]\n" + "ld1w { z24.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[2]\n" + "fmla z12.s, z25.s, z2.s[2]\n" + "fmla z16.s, z25.s, z1.s[2]\n" + "fmla z20.s, z25.s, z0.s[2]\n" + "ld1w { z25.s }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[2]\n" + "fmla z13.s, z24.s, z2.s[2]\n" + "fmla z17.s, z24.s, z1.s[2]\n" + "fmla z21.s, z24.s, z0.s[2]\n" + "ld1w { z24.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z25.s, z3.s[2]\n" + "fmla z14.s, z25.s, z2.s[2]\n" + "fmla z18.s, z25.s, z1.s[2]\n" + "fmla z22.s, z25.s, z0.s[2]\n" + "ld1w { z25.s }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.s, z24.s, z3.s[2]\n" + "fmla z15.s, z24.s, z2.s[2]\n" + "fmla z19.s, z24.s, z1.s[2]\n" + "fmla z23.s, z24.s, z0.s[2]\n" + "ld1w { z24.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z25.s, z3.s[3]\n" + "fmla z12.s, z25.s, z2.s[3]\n" + "fmla z16.s, z25.s, z1.s[3]\n" + "fmla z20.s, z25.s, z0.s[3]\n" + "ld1w { z25.s }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.s, z24.s, z3.s[3]\n" + "fmla z13.s, z24.s, z2.s[3]\n" + "fmla z17.s, z24.s, z1.s[3]\n" + "fmla z21.s, z24.s, z0.s[3]\n" + "ld1w { z24.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.s, z25.s, z3.s[3]\n" + "fmla z14.s, z25.s, z2.s[3]\n" + "fmla z18.s, z25.s, z1.s[3]\n" + "fmla z22.s, z25.s, z0.s[3]\n" + "fmla z11.s, z24.s, z3.s[3]\n" + "fmla z15.s, z24.s, z2.s[3]\n" + "fmla z19.s, z24.s, z1.s[3]\n" + "fmla z23.s, z24.s, z0.s[3]\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -997,95 +997,95 @@ void sve_hybrid_fp32_mla_6x4VL ( "subs x27, x27, #0x1\n" "ld1rqw { z2.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" + "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z25.s, z0.s[0]\n" + "fmla z12.s, z25.s, z1.s[0]\n" + "fmla z16.s, z25.s, z2.s[0]\n" + "fmla z20.s, z25.s, z3.s[0]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z24.s, z0.s[0]\n" + "fmla z13.s, z24.s, z1.s[0]\n" + "fmla z17.s, z24.s, z2.s[0]\n" + "fmla z21.s, z24.s, z3.s[0]\n" + "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, 
z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z10.s, z25.s, z0.s[0]\n" + "fmla z14.s, z25.s, z1.s[0]\n" + "fmla z18.s, z25.s, z2.s[0]\n" + "fmla z22.s, z25.s, z3.s[0]\n" + "fmla z11.s, z24.s, z0.s[0]\n" + "fmla z15.s, z24.s, z1.s[0]\n" + "fmla z19.s, z24.s, z2.s[0]\n" + "fmla z23.s, z24.s, z3.s[0]\n" "ble 50f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" + "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z25.s, z0.s[1]\n" + "fmla z12.s, z25.s, z1.s[1]\n" + "fmla z16.s, z25.s, z2.s[1]\n" + "fmla z20.s, z25.s, z3.s[1]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.s, z24.s, z0.s[1]\n" + "fmla z13.s, z24.s, z1.s[1]\n" + "fmla z17.s, z24.s, z2.s[1]\n" + "fmla z21.s, z24.s, z3.s[1]\n" + "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z10.s, z25.s, z0.s[1]\n" + "fmla z14.s, z25.s, z1.s[1]\n" + "fmla z18.s, z25.s, z2.s[1]\n" + "fmla z22.s, z25.s, z3.s[1]\n" + "fmla z11.s, z24.s, z0.s[1]\n" + "fmla z15.s, z24.s, z1.s[1]\n" + "fmla z19.s, z24.s, z2.s[1]\n" + "fmla z23.s, z24.s, z3.s[1]\n" "ble 50f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" + "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z25.s, z0.s[2]\n" + "fmla z12.s, z25.s, z1.s[2]\n" + "fmla z16.s, z25.s, z2.s[2]\n" + "fmla z20.s, z25.s, z3.s[2]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x1\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z9.s, z24.s, z0.s[2]\n" + "fmla z13.s, z24.s, z1.s[2]\n" + "fmla z17.s, z24.s, z2.s[2]\n" + "fmla z21.s, z24.s, z3.s[2]\n" + "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z10.s, z25.s, z0.s[2]\n" + "fmla z14.s, z25.s, z1.s[2]\n" + "fmla z18.s, z25.s, z2.s[2]\n" + "fmla z22.s, z25.s, z3.s[2]\n" + "fmla z11.s, z24.s, z0.s[2]\n" + "fmla z15.s, z24.s, z1.s[2]\n" + "fmla z19.s, z24.s, z2.s[2]\n" + "fmla z23.s, z24.s, z3.s[2]\n" "ble 50f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, 
z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z25.s }, p5/Z, [x10]\n" + "ld1w { z24.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z25.s, z0.s[3]\n" + "fmla z12.s, z25.s, z1.s[3]\n" + "fmla z16.s, z25.s, z2.s[3]\n" + "fmla z20.s, z25.s, z3.s[3]\n" + "ld1w { z25.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z24.s, z0.s[3]\n" + "fmla z13.s, z24.s, z1.s[3]\n" + "fmla z17.s, z24.s, z2.s[3]\n" + "fmla z21.s, z24.s, z3.s[3]\n" + "ld1w { z24.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z10.s, z25.s, z0.s[3]\n" + "fmla z14.s, z25.s, z1.s[3]\n" + "fmla z18.s, z25.s, z2.s[3]\n" + "fmla z22.s, z25.s, z3.s[3]\n" + "fmla z11.s, z24.s, z0.s[3]\n" + "fmla z15.s, z24.s, z1.s[3]\n" + "fmla z19.s, z24.s, z2.s[3]\n" + "fmla z23.s, z24.s, z3.s[3]\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1097,41 +1097,41 @@ void sve_hybrid_fp32_mla_6x4VL ( "add x23, x24, x20, LSL #2\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin z14.s, p5/M, z14.s, z25.s\n" + "fmin z15.s, p5/M, z15.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmin z20.s, p5/M, z20.s, z25.s\n" + "fmin z21.s, p5/M, z21.s, z25.s\n" + "fmin z22.s, p5/M, z22.s, z25.s\n" + "fmin z23.s, p5/M, z23.s, z25.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, 
z11.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z15.s, p5/M, z15.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" + "fmax z20.s, p5/M, z20.s, z24.s\n" + "fmax z21.s, p5/M, z21.s, z24.s\n" + "fmax z22.s, p5/M, z22.s, z24.s\n" + "fmax z23.s, p5/M, z23.s, z24.s\n" "51:" // Height 4: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -1195,30 +1195,30 @@ void sve_hybrid_fp32_mla_6x4VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1246,15 +1246,15 @@ void sve_hybrid_fp32_mla_6x4VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1265,124 +1265,124 @@ void sve_hybrid_fp32_mla_6x4VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x26, 
%x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "60:" // Height 5: input setup done "cmp x27, #0x4\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "ld1rqw { z3.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1rqw { z0.s }, p0/Z, [x22]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" + "fmla z8.s, z29.s, z4.s[0]\n" + "fmla z12.s, z29.s, z3.s[0]\n" + "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.s, z29.s, z2.s[0]\n" + "fmla z20.s, z29.s, z1.s[0]\n" "add x25, x25, #0x10\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z24.s, z29.s, z0.s[0]\n" + "fmla z9.s, z28.s, z4.s[0]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" "add x24, x24, #0x10\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z13.s, z28.s, z3.s[0]\n" + "fmla z17.s, z28.s, z2.s[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z21.s, z28.s, z1.s[0]\n" + "fmla z25.s, z28.s, z0.s[0]\n" + "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z29.s, z4.s[0]\n" + "fmla z14.s, z29.s, z3.s[0]\n" + "fmla z18.s, z29.s, z2.s[0]\n" + "fmla z22.s, z29.s, z1.s[0]\n" + "fmla z26.s, z29.s, z0.s[0]\n" + "fmla z11.s, z28.s, z4.s[0]\n" + "ld1w { z29.s }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[0]\n" + "fmla z19.s, z28.s, z2.s[0]\n" + "fmla z23.s, z28.s, z1.s[0]\n" + "fmla z27.s, z28.s, z0.s[0]\n" + "ld1w { z28.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[1]\n" + "fmla z12.s, z29.s, z3.s[1]\n" + "fmla z16.s, z29.s, z2.s[1]\n" + "fmla z20.s, z29.s, z1.s[1]\n" + "fmla z24.s, z29.s, z0.s[1]\n" + "fmla z9.s, z28.s, z4.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[1]\n" + "fmla z17.s, z28.s, z2.s[1]\n" + "fmla z21.s, z28.s, z1.s[1]\n" + "fmla z25.s, z28.s, z0.s[1]\n" + "ld1w { z28.s }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, 
#16\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z10.s, z29.s, z4.s[1]\n" + "fmla z14.s, z29.s, z3.s[1]\n" + "fmla z18.s, z29.s, z2.s[1]\n" + "fmla z22.s, z29.s, z1.s[1]\n" + "fmla z26.s, z29.s, z0.s[1]\n" + "fmla z11.s, z28.s, z4.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[1]\n" + "fmla z19.s, z28.s, z2.s[1]\n" + "fmla z23.s, z28.s, z1.s[1]\n" + "fmla z27.s, z28.s, z0.s[1]\n" + "ld1w { z28.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[2]\n" + "fmla z12.s, z29.s, z3.s[2]\n" + "fmla z16.s, z29.s, z2.s[2]\n" + "fmla z20.s, z29.s, z1.s[2]\n" + "fmla z24.s, z29.s, z0.s[2]\n" + "fmla z9.s, z28.s, z4.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[2]\n" + "fmla z17.s, z28.s, z2.s[2]\n" + "fmla z21.s, z28.s, z1.s[2]\n" + "fmla z25.s, z28.s, z0.s[2]\n" + "ld1w { z28.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z29.s, z4.s[2]\n" + "fmla z14.s, z29.s, z3.s[2]\n" + "fmla z18.s, z29.s, z2.s[2]\n" + "fmla z22.s, z29.s, z1.s[2]\n" + "fmla z26.s, z29.s, z0.s[2]\n" + "fmla z11.s, z28.s, z4.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z15.s, z28.s, z3.s[2]\n" + "fmla z19.s, z28.s, z2.s[2]\n" + "fmla z23.s, z28.s, z1.s[2]\n" + "fmla z27.s, z28.s, z0.s[2]\n" + "ld1w { z28.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z29.s, z4.s[3]\n" + "fmla z12.s, z29.s, z3.s[3]\n" + "fmla z16.s, z29.s, z2.s[3]\n" + "fmla z20.s, z29.s, z1.s[3]\n" + "fmla z24.s, z29.s, z0.s[3]\n" + "fmla z9.s, z28.s, z4.s[3]\n" + "ld1w { z29.s }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z13.s, z28.s, z3.s[3]\n" + "fmla z17.s, z28.s, z2.s[3]\n" + 
"fmla z21.s, z28.s, z1.s[3]\n" + "fmla z25.s, z28.s, z0.s[3]\n" + "ld1w { z28.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.s, z29.s, z4.s[3]\n" + "fmla z14.s, z29.s, z3.s[3]\n" + "fmla z18.s, z29.s, z2.s[3]\n" + "fmla z22.s, z29.s, z1.s[3]\n" + "fmla z26.s, z29.s, z0.s[3]\n" + "fmla z11.s, z28.s, z4.s[3]\n" + "fmla z15.s, z28.s, z3.s[3]\n" + "fmla z19.s, z28.s, z2.s[3]\n" + "fmla z23.s, z28.s, z1.s[3]\n" + "fmla z27.s, z28.s, z0.s[3]\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -1392,111 +1392,111 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z2.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" + "fmla z8.s, z29.s, z0.s[0]\n" + "fmla z12.s, z29.s, z1.s[0]\n" + "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z16.s, z29.s, z2.s[0]\n" + "fmla z20.s, z29.s, z3.s[0]\n" + "fmla z24.s, z29.s, z4.s[0]\n" + "fmla z9.s, z28.s, z0.s[0]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z28.s, z1.s[0]\n" + "fmla z17.s, z28.s, z2.s[0]\n" + "fmla z21.s, z28.s, z3.s[0]\n" + "fmla z25.s, z28.s, z4.s[0]\n" + "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z10.s, z29.s, z0.s[0]\n" + "fmla z14.s, z29.s, z1.s[0]\n" + "fmla z18.s, z29.s, z2.s[0]\n" + "fmla z22.s, z29.s, z3.s[0]\n" + "fmla z26.s, z29.s, z4.s[0]\n" + "fmla z11.s, z28.s, z0.s[0]\n" + "fmla z15.s, z28.s, z1.s[0]\n" + "fmla z19.s, z28.s, z2.s[0]\n" + "fmla z23.s, z28.s, z3.s[0]\n" + "fmla z27.s, z28.s, z4.s[0]\n" "ble 63f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" + "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z29.s, z0.s[1]\n" + "fmla z12.s, z29.s, z1.s[1]\n" + "fmla z16.s, z29.s, z2.s[1]\n" + "fmla z20.s, z29.s, z3.s[1]\n" "subs x27, x27, #0x1\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.s, z29.s, z4.s[1]\n" + "fmla z9.s, z28.s, z0.s[1]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z28.s, z1.s[1]\n" + "fmla z17.s, z28.s, z2.s[1]\n" + "fmla z21.s, z28.s, z3.s[1]\n" + "fmla z25.s, z28.s, z4.s[1]\n" + "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, 
z4.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z10.s, z29.s, z0.s[1]\n" + "fmla z14.s, z29.s, z1.s[1]\n" + "fmla z18.s, z29.s, z2.s[1]\n" + "fmla z22.s, z29.s, z3.s[1]\n" + "fmla z26.s, z29.s, z4.s[1]\n" + "fmla z11.s, z28.s, z0.s[1]\n" + "fmla z15.s, z28.s, z1.s[1]\n" + "fmla z19.s, z28.s, z2.s[1]\n" + "fmla z23.s, z28.s, z3.s[1]\n" + "fmla z27.s, z28.s, z4.s[1]\n" "ble 63f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" + "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z29.s, z0.s[2]\n" + "fmla z12.s, z29.s, z1.s[2]\n" + "fmla z16.s, z29.s, z2.s[2]\n" + "fmla z20.s, z29.s, z3.s[2]\n" "subs x27, x27, #0x1\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.s, z29.s, z4.s[2]\n" + "fmla z9.s, z28.s, z0.s[2]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z28.s, z1.s[2]\n" + "fmla z17.s, z28.s, z2.s[2]\n" + "fmla z21.s, z28.s, z3.s[2]\n" + "fmla z25.s, z28.s, z4.s[2]\n" + "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z10.s, z29.s, z0.s[2]\n" + "fmla z14.s, z29.s, z1.s[2]\n" + "fmla z18.s, z29.s, z2.s[2]\n" + "fmla z22.s, z29.s, z3.s[2]\n" + "fmla z26.s, z29.s, z4.s[2]\n" + "fmla z11.s, z28.s, z0.s[2]\n" + "fmla z15.s, z28.s, z1.s[2]\n" + "fmla z19.s, z28.s, z2.s[2]\n" + "fmla z23.s, z28.s, z3.s[2]\n" + "fmla z27.s, z28.s, z4.s[2]\n" "ble 63f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z29.s }, p5/Z, [x10]\n" + "ld1w { z28.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z29.s, z0.s[3]\n" + "fmla z12.s, z29.s, z1.s[3]\n" + "fmla z16.s, z29.s, z2.s[3]\n" + "fmla z20.s, z29.s, z3.s[3]\n" + "fmla z24.s, z29.s, z4.s[3]\n" + "fmla z9.s, z28.s, z0.s[3]\n" + "ld1w { z29.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z13.s, z28.s, z1.s[3]\n" + "fmla z17.s, z28.s, z2.s[3]\n" + "fmla z21.s, z28.s, z3.s[3]\n" + "fmla z25.s, z28.s, z4.s[3]\n" + "ld1w { z28.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z10.s, 
z29.s, z0.s[3]\n" + "fmla z14.s, z29.s, z1.s[3]\n" + "fmla z18.s, z29.s, z2.s[3]\n" + "fmla z22.s, z29.s, z3.s[3]\n" + "fmla z26.s, z29.s, z4.s[3]\n" + "fmla z11.s, z28.s, z0.s[3]\n" + "fmla z15.s, z28.s, z1.s[3]\n" + "fmla z19.s, z28.s, z2.s[3]\n" + "fmla z23.s, z28.s, z3.s[3]\n" + "fmla z27.s, z28.s, z4.s[3]\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1509,49 +1509,49 @@ void sve_hybrid_fp32_mla_6x4VL ( "add x22, x23, x20, LSL #2\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z29.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z23.s, p5/M, z23.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z23.s, p5/M, z23.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - "fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "ld1rw { z28.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z29.s\n" + "fmin z9.s, p5/M, z9.s, z29.s\n" + "fmin z10.s, p5/M, z10.s, z29.s\n" + "fmin z11.s, p5/M, z11.s, z29.s\n" + "fmin z12.s, p5/M, z12.s, z29.s\n" + "fmin z13.s, p5/M, z13.s, z29.s\n" + "fmin z14.s, p5/M, z14.s, z29.s\n" + "fmin z15.s, p5/M, z15.s, z29.s\n" + "fmin z16.s, p5/M, z16.s, z29.s\n" + "fmin z17.s, p5/M, z17.s, z29.s\n" + "fmin z18.s, p5/M, z18.s, z29.s\n" + "fmin z19.s, p5/M, z19.s, z29.s\n" + "fmin z20.s, p5/M, z20.s, z29.s\n" + "fmin z21.s, p5/M, z21.s, z29.s\n" + "fmin z22.s, p5/M, z22.s, z29.s\n" + "fmin z23.s, p5/M, z23.s, z29.s\n" + "fmin z24.s, p5/M, z24.s, z29.s\n" + "fmin z25.s, p5/M, z25.s, z29.s\n" + "fmin z26.s, p5/M, z26.s, z29.s\n" + "fmin z27.s, p5/M, z27.s, z29.s\n" + "fmax z8.s, p5/M, z8.s, z28.s\n" + "fmax z9.s, p5/M, z9.s, z28.s\n" + "fmax z10.s, p5/M, z10.s, z28.s\n" + "fmax z11.s, p5/M, z11.s, z28.s\n" + "fmax z12.s, p5/M, z12.s, z28.s\n" + "fmax z13.s, p5/M, z13.s, z28.s\n" + "fmax z14.s, p5/M, z14.s, z28.s\n" + "fmax z15.s, p5/M, z15.s, z28.s\n" + "fmax z16.s, p5/M, z16.s, z28.s\n" + "fmax z17.s, p5/M, z17.s, z28.s\n" + "fmax z18.s, p5/M, z18.s, z28.s\n" + "fmax z19.s, p5/M, z19.s, z28.s\n" + "fmax z20.s, p5/M, z20.s, z28.s\n" + "fmax z21.s, p5/M, z21.s, z28.s\n" + "fmax z22.s, p5/M, z22.s, z28.s\n" + "fmax z23.s, p5/M, z23.s, z28.s\n" + "fmax z24.s, p5/M, 
z24.s, z28.s\n" + "fmax z25.s, p5/M, z25.s, z28.s\n" + "fmax z26.s, p5/M, z26.s, z28.s\n" + "fmax z27.s, p5/M, z27.s, z28.s\n" "64:" // Height 5: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -1626,35 +1626,35 @@ void sve_hybrid_fp32_mla_6x4VL ( "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" + "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" - "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x24]\n" - "ld1w { z17.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x22]\n" - "ld1w { z25.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x23]\n" + "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x21]\n" + "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1686,16 +1686,16 @@ void sve_hybrid_fp32_mla_6x4VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1707,143 +1707,143 
@@ void sve_hybrid_fp32_mla_6x4VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "73:" // Height 6: input setup done "cmp x27, #0x4\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" + "ld1rqw { z7.s }, p0/Z, [x26]\n" + "ld1rqw { z6.s }, p0/Z, [x25]\n" "sub x27, x27, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1rqw { z5.s }, p0/Z, [x21]\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z1.s }, p5/Z, [x10]\n" + "ld1w { z0.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[0]\n" + "fmla z12.s, z1.s, z6.s[0]\n" + "fmla z16.s, z1.s, z5.s[0]\n" + "fmla z20.s, z1.s, z4.s[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z24.s, z1.s, z3.s[0]\n" + "fmla z28.s, z1.s, z2.s[0]\n" + "ld1w { z1.s }, p5/Z, [x10, #2, MUL VL]\n" "add x21, x21, #0x10\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z30.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "fmla z31.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z28.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[0]\n" + "fmla z13.s, z0.s, z6.s[0]\n" + "fmla z17.s, z0.s, z5.s[0]\n" + "fmla z21.s, z0.s, z4.s[0]\n" + "fmla z25.s, z0.s, z3.s[0]\n" + "fmla z29.s, z0.s, z2.s[0]\n" + "ld1w { z0.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z10.s, z1.s, z7.s[0]\n" + "fmla z14.s, z1.s, z6.s[0]\n" + "fmla z18.s, z1.s, z5.s[0]\n" + "fmla z22.s, z1.s, z4.s[0]\n" + "fmla z26.s, z1.s, z3.s[0]\n" + "fmla z30.s, z1.s, z2.s[0]\n" + "ld1w { z1.s }, p5/Z, [x10, #4, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[0]\n" + "fmla z15.s, z0.s, z6.s[0]\n" + "fmla z19.s, z0.s, 
z5.s[0]\n" + "fmla z23.s, z0.s, z4.s[0]\n" + "fmla z27.s, z0.s, z3.s[0]\n" + "fmla z31.s, z0.s, z2.s[0]\n" + "ld1w { z0.s }, p5/Z, [x10, #5, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[1]\n" + "fmla z12.s, z1.s, z6.s[1]\n" + "fmla z16.s, z1.s, z5.s[1]\n" + "fmla z20.s, z1.s, z4.s[1]\n" + "fmla z24.s, z1.s, z3.s[1]\n" + "fmla z28.s, z1.s, z2.s[1]\n" + "ld1w { z1.s }, p5/Z, [x10, #6, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[1]\n" + "fmla z13.s, z0.s, z6.s[1]\n" + "fmla z17.s, z0.s, z5.s[1]\n" + "fmla z21.s, z0.s, z4.s[1]\n" + "fmla z25.s, z0.s, z3.s[1]\n" + "fmla z29.s, z0.s, z2.s[1]\n" + "ld1w { z0.s }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z30.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "fmla z31.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z28.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z30.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "fmla z31.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z30.s, z6.s, z5.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" - "fmla z31.s, z7.s, z5.s[3]\n" + "fmla z10.s, z1.s, z7.s[1]\n" + "fmla z14.s, z1.s, z6.s[1]\n" + "fmla z18.s, z1.s, z5.s[1]\n" + "fmla z22.s, z1.s, z4.s[1]\n" + "fmla z26.s, z1.s, z3.s[1]\n" + "fmla z30.s, z1.s, z2.s[1]\n" + "ld1w { z1.s }, p5/Z, [x10, #-8, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[1]\n" + "fmla z15.s, z0.s, z6.s[1]\n" + "fmla z19.s, z0.s, z5.s[1]\n" + "fmla z23.s, z0.s, z4.s[1]\n" + "fmla z27.s, z0.s, z3.s[1]\n" + "fmla z31.s, z0.s, z2.s[1]\n" + "ld1w { z0.s }, p5/Z, [x10, #-7, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[2]\n" + "fmla z12.s, z1.s, z6.s[2]\n" + "fmla z16.s, z1.s, z5.s[2]\n" + "fmla z20.s, z1.s, z4.s[2]\n" + "fmla z24.s, 
z1.s, z3.s[2]\n" + "fmla z28.s, z1.s, z2.s[2]\n" + "ld1w { z1.s }, p5/Z, [x10, #-6, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[2]\n" + "fmla z13.s, z0.s, z6.s[2]\n" + "fmla z17.s, z0.s, z5.s[2]\n" + "fmla z21.s, z0.s, z4.s[2]\n" + "fmla z25.s, z0.s, z3.s[2]\n" + "fmla z29.s, z0.s, z2.s[2]\n" + "ld1w { z0.s }, p5/Z, [x10, #-5, MUL VL]\n" + "fmla z10.s, z1.s, z7.s[2]\n" + "fmla z14.s, z1.s, z6.s[2]\n" + "fmla z18.s, z1.s, z5.s[2]\n" + "fmla z22.s, z1.s, z4.s[2]\n" + "fmla z26.s, z1.s, z3.s[2]\n" + "fmla z30.s, z1.s, z2.s[2]\n" + "ld1w { z1.s }, p5/Z, [x10, #-4, MUL VL]\n" + "fmla z11.s, z0.s, z7.s[2]\n" + "fmla z15.s, z0.s, z6.s[2]\n" + "fmla z19.s, z0.s, z5.s[2]\n" + "fmla z23.s, z0.s, z4.s[2]\n" + "fmla z27.s, z0.s, z3.s[2]\n" + "fmla z31.s, z0.s, z2.s[2]\n" + "ld1w { z0.s }, p5/Z, [x10, #-3, MUL VL]\n" + "fmla z8.s, z1.s, z7.s[3]\n" + "fmla z12.s, z1.s, z6.s[3]\n" + "fmla z16.s, z1.s, z5.s[3]\n" + "fmla z20.s, z1.s, z4.s[3]\n" + "fmla z24.s, z1.s, z3.s[3]\n" + "fmla z28.s, z1.s, z2.s[3]\n" + "ld1w { z1.s }, p5/Z, [x10, #-2, MUL VL]\n" + "fmla z9.s, z0.s, z7.s[3]\n" + "fmla z13.s, z0.s, z6.s[3]\n" + "fmla z17.s, z0.s, z5.s[3]\n" + "fmla z21.s, z0.s, z4.s[3]\n" + "fmla z25.s, z0.s, z3.s[3]\n" + "fmla z29.s, z0.s, z2.s[3]\n" + "ld1w { z0.s }, p5/Z, [x10, #-1, MUL VL]\n" + "fmla z10.s, z1.s, z7.s[3]\n" + "fmla z14.s, z1.s, z6.s[3]\n" + "fmla z18.s, z1.s, z5.s[3]\n" + "fmla z22.s, z1.s, z4.s[3]\n" + "fmla z26.s, z1.s, z3.s[3]\n" + "fmla z30.s, z1.s, z2.s[3]\n" + "fmla z11.s, z0.s, z7.s[3]\n" + "fmla z15.s, z0.s, z6.s[3]\n" + "fmla z19.s, z0.s, z5.s[3]\n" + "fmla z23.s, z0.s, z4.s[3]\n" + "fmla z27.s, z0.s, z3.s[3]\n" + "fmla z31.s, z0.s, z2.s[3]\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" @@ -1854,127 +1854,127 @@ void sve_hybrid_fp32_mla_6x4VL ( "ld1rqw { z3.s }, p0/Z, [x23]\n" "ld1rqw { z4.s }, p0/Z, [x22]\n" "ld1rqw { z5.s }, p0/Z, [x21]\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[0]\n" - "fmla z12.s, z6.s, z1.s[0]\n" - "fmla z16.s, z6.s, z2.s[0]\n" - "fmla z20.s, z6.s, z3.s[0]\n" - "fmla z24.s, z6.s, z4.s[0]\n" - "fmla z28.s, z6.s, z5.s[0]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[0]\n" - "fmla z13.s, z7.s, z1.s[0]\n" - "fmla z17.s, z7.s, z2.s[0]\n" - "fmla z21.s, z7.s, z3.s[0]\n" - "fmla z25.s, z7.s, z4.s[0]\n" - "fmla z29.s, z7.s, z5.s[0]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" + "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z7.s, z0.s[0]\n" + "fmla z12.s, z7.s, z1.s[0]\n" + "fmla z16.s, z7.s, z2.s[0]\n" + "fmla z20.s, z7.s, z3.s[0]\n" + "fmla z24.s, z7.s, z4.s[0]\n" + "fmla z28.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z6.s, z0.s[0]\n" + "fmla z13.s, z6.s, z1.s[0]\n" + "fmla z17.s, z6.s, z2.s[0]\n" + "fmla z21.s, z6.s, z3.s[0]\n" + "fmla z25.s, z6.s, z4.s[0]\n" + "fmla z29.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[0]\n" - "fmla z14.s, z6.s, z1.s[0]\n" - "fmla z18.s, z6.s, z2.s[0]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z26.s, z6.s, z4.s[0]\n" - "fmla z30.s, z6.s, z5.s[0]\n" - "fmla z11.s, z7.s, z0.s[0]\n" - "fmla z15.s, z7.s, z1.s[0]\n" - "fmla z19.s, z7.s, z2.s[0]\n" - "fmla z23.s, z7.s, z3.s[0]\n" - "fmla z27.s, z7.s, z4.s[0]\n" - "fmla z31.s, z7.s, z5.s[0]\n" + "fmla z10.s, z7.s, z0.s[0]\n" + "fmla z14.s, z7.s, z1.s[0]\n" + "fmla z18.s, z7.s, z2.s[0]\n" + "fmla z22.s, z7.s, z3.s[0]\n" 
+ "fmla z26.s, z7.s, z4.s[0]\n" + "fmla z30.s, z7.s, z5.s[0]\n" + "fmla z11.s, z6.s, z0.s[0]\n" + "fmla z15.s, z6.s, z1.s[0]\n" + "fmla z19.s, z6.s, z2.s[0]\n" + "fmla z23.s, z6.s, z3.s[0]\n" + "fmla z27.s, z6.s, z4.s[0]\n" + "fmla z31.s, z6.s, z5.s[0]\n" "ble 76f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[1]\n" - "fmla z12.s, z6.s, z1.s[1]\n" - "fmla z16.s, z6.s, z2.s[1]\n" - "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" + "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z7.s, z0.s[1]\n" + "fmla z12.s, z7.s, z1.s[1]\n" + "fmla z16.s, z7.s, z2.s[1]\n" + "fmla z20.s, z7.s, z3.s[1]\n" "subs x27, x27, #0x1\n" - "fmla z24.s, z6.s, z4.s[1]\n" - "fmla z28.s, z6.s, z5.s[1]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[1]\n" - "fmla z13.s, z7.s, z1.s[1]\n" - "fmla z17.s, z7.s, z2.s[1]\n" - "fmla z21.s, z7.s, z3.s[1]\n" - "fmla z25.s, z7.s, z4.s[1]\n" - "fmla z29.s, z7.s, z5.s[1]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.s, z7.s, z4.s[1]\n" + "fmla z28.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z6.s, z0.s[1]\n" + "fmla z13.s, z6.s, z1.s[1]\n" + "fmla z17.s, z6.s, z2.s[1]\n" + "fmla z21.s, z6.s, z3.s[1]\n" + "fmla z25.s, z6.s, z4.s[1]\n" + "fmla z29.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[1]\n" - "fmla z14.s, z6.s, z1.s[1]\n" - "fmla z18.s, z6.s, z2.s[1]\n" - "fmla z22.s, z6.s, z3.s[1]\n" - "fmla z26.s, z6.s, z4.s[1]\n" - "fmla z30.s, z6.s, z5.s[1]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z15.s, z7.s, z1.s[1]\n" - "fmla z19.s, z7.s, z2.s[1]\n" - "fmla z23.s, z7.s, z3.s[1]\n" - "fmla z27.s, z7.s, z4.s[1]\n" - "fmla z31.s, z7.s, z5.s[1]\n" + "fmla z10.s, z7.s, z0.s[1]\n" + "fmla z14.s, z7.s, z1.s[1]\n" + "fmla z18.s, z7.s, z2.s[1]\n" + "fmla z22.s, z7.s, z3.s[1]\n" + "fmla z26.s, z7.s, z4.s[1]\n" + "fmla z30.s, z7.s, z5.s[1]\n" + "fmla z11.s, z6.s, z0.s[1]\n" + "fmla z15.s, z6.s, z1.s[1]\n" + "fmla z19.s, z6.s, z2.s[1]\n" + "fmla z23.s, z6.s, z3.s[1]\n" + "fmla z27.s, z6.s, z4.s[1]\n" + "fmla z31.s, z6.s, z5.s[1]\n" "ble 76f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[2]\n" - "fmla z12.s, z6.s, z1.s[2]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" + "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z7.s, z0.s[2]\n" + "fmla z12.s, z7.s, z1.s[2]\n" + "fmla z16.s, z7.s, z2.s[2]\n" + "fmla z20.s, z7.s, z3.s[2]\n" "subs x27, x27, #0x1\n" - "fmla z24.s, z6.s, z4.s[2]\n" - "fmla z28.s, z6.s, z5.s[2]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[2]\n" - "fmla z13.s, z7.s, z1.s[2]\n" - "fmla z17.s, z7.s, z2.s[2]\n" - "fmla z21.s, z7.s, z3.s[2]\n" - "fmla z25.s, z7.s, z4.s[2]\n" - "fmla z29.s, z7.s, z5.s[2]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "fmla z24.s, z7.s, z4.s[2]\n" + "fmla z28.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z6.s, z0.s[2]\n" + "fmla z13.s, z6.s, z1.s[2]\n" + "fmla z17.s, z6.s, z2.s[2]\n" + "fmla z21.s, z6.s, z3.s[2]\n" + "fmla z25.s, z6.s, z4.s[2]\n" + "fmla z29.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[2]\n" - "fmla z14.s, z6.s, z1.s[2]\n" - "fmla z18.s, z6.s, z2.s[2]\n" - "fmla z22.s, z6.s, z3.s[2]\n" - "fmla z26.s, z6.s, z4.s[2]\n" - "fmla z30.s, z6.s, z5.s[2]\n" - "fmla z11.s, z7.s, 
z0.s[2]\n" - "fmla z15.s, z7.s, z1.s[2]\n" - "fmla z19.s, z7.s, z2.s[2]\n" - "fmla z23.s, z7.s, z3.s[2]\n" - "fmla z27.s, z7.s, z4.s[2]\n" - "fmla z31.s, z7.s, z5.s[2]\n" + "fmla z10.s, z7.s, z0.s[2]\n" + "fmla z14.s, z7.s, z1.s[2]\n" + "fmla z18.s, z7.s, z2.s[2]\n" + "fmla z22.s, z7.s, z3.s[2]\n" + "fmla z26.s, z7.s, z4.s[2]\n" + "fmla z30.s, z7.s, z5.s[2]\n" + "fmla z11.s, z6.s, z0.s[2]\n" + "fmla z15.s, z6.s, z1.s[2]\n" + "fmla z19.s, z6.s, z2.s[2]\n" + "fmla z23.s, z6.s, z3.s[2]\n" + "fmla z27.s, z6.s, z4.s[2]\n" + "fmla z31.s, z6.s, z5.s[2]\n" "ble 76f\n" - "ld1w { z6.s }, p5/Z, [x10]\n" - "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n" - "fmla z8.s, z6.s, z0.s[3]\n" - "fmla z12.s, z6.s, z1.s[3]\n" - "fmla z16.s, z6.s, z2.s[3]\n" - "fmla z20.s, z6.s, z3.s[3]\n" - "fmla z24.s, z6.s, z4.s[3]\n" - "fmla z28.s, z6.s, z5.s[3]\n" - "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n" - "fmla z9.s, z7.s, z0.s[3]\n" - "fmla z13.s, z7.s, z1.s[3]\n" - "fmla z17.s, z7.s, z2.s[3]\n" - "fmla z21.s, z7.s, z3.s[3]\n" - "fmla z25.s, z7.s, z4.s[3]\n" - "fmla z29.s, z7.s, z5.s[3]\n" - "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x10]\n" + "ld1w { z6.s }, p5/Z, [x10, #1, MUL VL]\n" + "fmla z8.s, z7.s, z0.s[3]\n" + "fmla z12.s, z7.s, z1.s[3]\n" + "fmla z16.s, z7.s, z2.s[3]\n" + "fmla z20.s, z7.s, z3.s[3]\n" + "fmla z24.s, z7.s, z4.s[3]\n" + "fmla z28.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x10, #2, MUL VL]\n" + "fmla z9.s, z6.s, z0.s[3]\n" + "fmla z13.s, z6.s, z1.s[3]\n" + "fmla z17.s, z6.s, z2.s[3]\n" + "fmla z21.s, z6.s, z3.s[3]\n" + "fmla z25.s, z6.s, z4.s[3]\n" + "fmla z29.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "fmla z10.s, z6.s, z0.s[3]\n" - "fmla z14.s, z6.s, z1.s[3]\n" - "fmla z18.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[3]\n" - "fmla z26.s, z6.s, z4.s[3]\n" - "fmla z30.s, z6.s, z5.s[3]\n" - "fmla z11.s, z7.s, z0.s[3]\n" - "fmla z15.s, z7.s, z1.s[3]\n" - "fmla z19.s, z7.s, z2.s[3]\n" - "fmla z23.s, z7.s, z3.s[3]\n" - "fmla z27.s, z7.s, z4.s[3]\n" - "fmla z31.s, z7.s, z5.s[3]\n" + "fmla z10.s, z7.s, z0.s[3]\n" + "fmla z14.s, z7.s, z1.s[3]\n" + "fmla z18.s, z7.s, z2.s[3]\n" + "fmla z22.s, z7.s, z3.s[3]\n" + "fmla z26.s, z7.s, z4.s[3]\n" + "fmla z30.s, z7.s, z5.s[3]\n" + "fmla z11.s, z6.s, z0.s[3]\n" + "fmla z15.s, z6.s, z1.s[3]\n" + "fmla z19.s, z6.s, z2.s[3]\n" + "fmla z23.s, z6.s, z3.s[3]\n" + "fmla z27.s, z6.s, z4.s[3]\n" + "fmla z31.s, z6.s, z5.s[3]\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -2081,7 +2081,6 @@ void sve_hybrid_fp32_mla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", 
"z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -2089,4 +2088,4 @@ void sve_hybrid_fp32_mla_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp index c0718b1e75..a353c9d660 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #define ARGLIST \ @@ -89,5 +89,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp index 2ccd050f18..344341205b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp @@ -127,11 +127,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" "cbnz x10, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -143,19 +143,19 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z0.s }, p1/Z, [x28]\n" "ble 10f\n" "9:" // Height 1: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "addvl x12, x12, #1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "addvl x12, x12, #1\n" "bne 6b\n" "tbz %x[flags], #1, 11f\n" @@ -189,9 +189,9 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "15:" // Height 2: no bias "tbz %x[flags], #0, 16f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" + "add x20, x11, x20, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "ld1w { z25.s }, p0/Z, [x27]\n" + "ld1w { z25.s }, p0/Z, [x20]\n" "b 17f\n" "16:" // Height 2: no accumulate "mov z24.b, #0x0\n" @@ -201,12 +201,12 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" "cbnz x10, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -214,30 +214,30 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 20f\n" "19:" // Height 2: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" "20:" // Height 2: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "ld1rw { z1.s }, p1/Z, [x27]\n" "ble 22f\n" "21:" // Height 2: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + 
"fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "ld1rw { z1.s }, p1/Z, [x27]\n" "bgt 21b\n" "22:" // Height 2: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" "bne 18b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" @@ -277,11 +277,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "27:" // Height 3: no bias "tbz %x[flags], #0, 28f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x21, x11, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" + "ld1w { z25.s }, p0/Z, [x21]\n" + "ld1w { z26.s }, p0/Z, [x20]\n" "b 29f\n" "28:" // Height 3: no accumulate "mov z24.b, #0x0\n" @@ -292,13 +292,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" "cbnz x10, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -307,8 +307,8 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 32f\n" "31:" // Height 3: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" "32:" // Height 3: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -316,14 +316,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z2.s }, p1/Z, [x26]\n" "ble 34f\n" "33:" // Height 3: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "addvl x12, x12, #1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "ld1rw { z1.s }, p1/Z, [x27]\n" @@ -331,13 +331,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 33b\n" "34:" // Height 3: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -381,13 +381,13 @@ void 
sve_hybrid_fp32_mla_8x1VL_a64fx ( "39:" // Height 4: no bias "tbz %x[flags], #0, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x22, x11, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" - "ld1w { z27.s }, p0/Z, [x25]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z25.s }, p0/Z, [x22]\n" + "ld1w { z26.s }, p0/Z, [x21]\n" + "ld1w { z27.s }, p0/Z, [x20]\n" "b 41f\n" "40:" // Height 4: no accumulate "mov z24.b, #0x0\n" @@ -399,14 +399,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "42:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 43f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" "cbnz x10, 44f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -416,9 +416,9 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 44f\n" "43:" // Height 4: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "44:" // Height 4: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -427,16 +427,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z3.s }, p1/Z, [x25]\n" "ble 46f\n" "45:" // Height 4: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "add x25, x25, #0x4\n" - "fmla z27.s, p1/M, z8.s, z3.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" "addvl x12, x12, #1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "ld1rw { z1.s }, p1/Z, [x27]\n" @@ -445,14 +445,14 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 45b\n" "46:" // Height 4: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" - "fmla z27.s, p1/M, z9.s, z3.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" "bne 42b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -501,15 +501,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "51:" // Height 5: no bias "tbz %x[flags], #0, 52f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x23, x11, x20, LSL #2\n" + "add x22, x23, x20, LSL #2\n" "ld1w { z24.s }, 
p0/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" - "ld1w { z27.s }, p0/Z, [x25]\n" - "ld1w { z28.s }, p0/Z, [x24]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z25.s }, p0/Z, [x23]\n" + "ld1w { z26.s }, p0/Z, [x22]\n" + "ld1w { z27.s }, p0/Z, [x21]\n" + "ld1w { z28.s }, p0/Z, [x20]\n" "b 53f\n" "52:" // Height 5: no accumulate "mov z24.b, #0x0\n" @@ -522,15 +522,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "54:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 55f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" "cbnz x10, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -541,10 +541,10 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 56f\n" "55:" // Height 5: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "56:" // Height 5: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -554,20 +554,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z4.s }, p1/Z, [x24]\n" "ble 58f\n" "57:" // Height 5: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "add x25, x25, #0x4\n" "add x24, x24, #0x4\n" - "fmla z27.s, p1/M, z8.s, z3.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "addvl x12, x12, #1\n" - "fmla z28.s, p1/M, z8.s, z4.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" "ld1rw { z1.s }, p1/Z, [x27]\n" "ld1rw { z2.s }, p1/Z, [x26]\n" "ld1rw { z3.s }, p1/Z, [x25]\n" @@ -575,15 +575,15 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 57b\n" "58:" // Height 5: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" - "fmla z27.s, p1/M, z9.s, z3.s\n" - "fmla z28.s, p1/M, z9.s, z4.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" "bne 54b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -636,18 +636,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 65f\n" "63:" // Height 6: no bias "tbz %x[flags], #0, 
64f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x11, x24, LSL #2\n" + "add x20, x23, x24, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z27.s }, p0/Z, [x25]\n" - "ld1w { z28.s }, p0/Z, [x24]\n" - "ld1w { z29.s }, p0/Z, [x23]\n" + "add x22, x20, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" + "ld1w { z25.s }, p0/Z, [x23]\n" + "ld1w { z26.s }, p0/Z, [x20]\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z27.s }, p0/Z, [x22]\n" + "ld1w { z28.s }, p0/Z, [x21]\n" + "ld1w { z29.s }, p0/Z, [x20]\n" "b 65f\n" "64:" // Height 6: no accumulate "mov z24.b, #0x0\n" @@ -661,16 +661,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "66:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 67f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" "cbnz x10, 68f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -682,11 +682,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 68f\n" "67:" // Height 6: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "68:" // Height 6: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -697,21 +697,21 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z5.s }, p1/Z, [x23]\n" "ble 70f\n" "69:" // Height 6: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "add x25, x25, #0x4\n" "add x24, x24, #0x4\n" - "fmla z27.s, p1/M, z8.s, z3.s\n" - "fmla z28.s, p1/M, z8.s, z4.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" "add x23, x23, #0x4\n" "addvl x12, x12, #1\n" - "fmla z29.s, p1/M, z8.s, z5.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "ld1rw { z1.s }, p1/Z, [x27]\n" "ld1rw { z2.s }, p1/Z, [x26]\n" @@ -721,16 +721,16 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 69b\n" "70:" // Height 6: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, 
p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" - "fmla z27.s, p1/M, z9.s, z3.s\n" - "fmla z28.s, p1/M, z9.s, z4.s\n" - "fmla z29.s, p1/M, z9.s, z5.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" "bne 66b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -788,20 +788,20 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 77f\n" "75:" // Height 7: no bias "tbz %x[flags], #0, 76f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x21, x11, x24, LSL #2\n" + "add x20, x21, x24, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z27.s }, p0/Z, [x25]\n" - "ld1w { z28.s }, p0/Z, [x24]\n" - "ld1w { z29.s }, p0/Z, [x23]\n" - "ld1w { z30.s }, p0/Z, [x22]\n" + "add x23, x20, x24, LSL #2\n" + "add x22, x23, x24, LSL #2\n" + "ld1w { z25.s }, p0/Z, [x21]\n" + "ld1w { z26.s }, p0/Z, [x20]\n" + "add x21, x22, x24, LSL #2\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z27.s }, p0/Z, [x23]\n" + "ld1w { z28.s }, p0/Z, [x22]\n" + "ld1w { z29.s }, p0/Z, [x21]\n" + "ld1w { z30.s }, p0/Z, [x20]\n" "b 77f\n" "76:" // Height 7: no accumulate "mov z24.b, #0x0\n" @@ -816,17 +816,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "78:" // Height 7: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 79f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" "cbnz x10, 80f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -839,12 +839,12 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 80f\n" "79:" // Height 7: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "80:" // Height 7: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -856,25 +856,25 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z6.s }, p1/Z, [x22]\n" "ble 82f\n" "81:" // Height 7: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, 
z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "add x25, x25, #0x4\n" "add x24, x24, #0x4\n" - "fmla z27.s, p1/M, z8.s, z3.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "add x23, x23, #0x4\n" "add x22, x22, #0x4\n" - "fmla z28.s, p1/M, z8.s, z4.s\n" - "fmla z29.s, p1/M, z8.s, z5.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" "addvl x12, x12, #1\n" "ld1rw { z1.s }, p1/Z, [x27]\n" - "fmla z30.s, p1/M, z8.s, z6.s\n" + "fmla z30.s, p1/M, z16.s, z6.s\n" "ld1rw { z2.s }, p1/Z, [x26]\n" "ld1rw { z3.s }, p1/Z, [x25]\n" "ld1rw { z4.s }, p1/Z, [x24]\n" @@ -883,17 +883,17 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 81b\n" "82:" // Height 7: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" - "fmla z27.s, p1/M, z9.s, z3.s\n" - "fmla z28.s, p1/M, z9.s, z4.s\n" - "fmla z29.s, p1/M, z9.s, z5.s\n" - "fmla z30.s, p1/M, z9.s, z6.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" + "fmla z30.s, p1/M, z16.s, z6.s\n" "bne 78b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -959,22 +959,22 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 89f\n" "87:" // Height 8: no bias "tbz %x[flags], #0, 88f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x11, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" "ld1w { z24.s }, p0/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p0/Z, [x27]\n" - "ld1w { z26.s }, p0/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z27.s }, p0/Z, [x25]\n" - "ld1w { z28.s }, p0/Z, [x24]\n" - "add x21, x22, x20, LSL #2\n" - "ld1w { z29.s }, p0/Z, [x23]\n" - "ld1w { z30.s }, p0/Z, [x22]\n" - "ld1w { z31.s }, p0/Z, [x21]\n" + "add x23, x21, x24, LSL #2\n" + "add x20, x23, x24, LSL #2\n" + "ld1w { z25.s }, p0/Z, [x22]\n" + "ld1w { z26.s }, p0/Z, [x21]\n" + "add x22, x20, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" + "ld1w { z27.s }, p0/Z, [x23]\n" + "ld1w { z28.s }, p0/Z, [x20]\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z29.s }, p0/Z, [x22]\n" + "ld1w { z30.s }, p0/Z, [x21]\n" + "ld1w { z31.s }, p0/Z, [x20]\n" "b 89f\n" "88:" // Height 8: no accumulate "mov z24.b, #0x0\n" @@ -990,18 +990,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "90:" // Height 8: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 91f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" - "ldr x21, [x21, #0x38]\n" + "ldr x20, 
[%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x21, [x20, #0x38]\n" "cbnz x10, 92f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1015,13 +1015,13 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "b 92f\n" "91:" // Height 8: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "92:" // Height 8: input setup done "subs x9, x9, #0x1\n" "ld1rw { z0.s }, p1/Z, [x28]\n" @@ -1034,27 +1034,27 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "ld1rw { z7.s }, p1/Z, [x21]\n" "ble 94f\n" "93:" // Height 8: Multiply loop: Main loop - "ld1w { z8.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x28, x28, #0x4\n" "subs x9, x9, #0x1\n" - "fmla z24.s, p1/M, z8.s, z0.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" "add x27, x27, #0x4\n" "add x26, x26, #0x4\n" - "fmla z25.s, p1/M, z8.s, z1.s\n" - "fmla z26.s, p1/M, z8.s, z2.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" "add x25, x25, #0x4\n" "add x24, x24, #0x4\n" - "fmla z27.s, p1/M, z8.s, z3.s\n" - "fmla z28.s, p1/M, z8.s, z4.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" "add x23, x23, #0x4\n" "add x22, x22, #0x4\n" - "fmla z29.s, p1/M, z8.s, z5.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" "ld1rw { z0.s }, p1/Z, [x28]\n" "add x21, x21, #0x4\n" "addvl x12, x12, #1\n" "ld1rw { z1.s }, p1/Z, [x27]\n" - "fmla z30.s, p1/M, z8.s, z6.s\n" - "fmla z31.s, p1/M, z8.s, z7.s\n" + "fmla z30.s, p1/M, z16.s, z6.s\n" + "fmla z31.s, p1/M, z16.s, z7.s\n" "ld1rw { z2.s }, p1/Z, [x26]\n" "ld1rw { z3.s }, p1/Z, [x25]\n" "ld1rw { z4.s }, p1/Z, [x24]\n" @@ -1064,18 +1064,18 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "bgt 93b\n" "94:" // Height 8: Multiply loop: Main loop skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "ld1w { z9.s }, p1/Z, [x12]\n" + "ld1w { z16.s }, p1/Z, [x12]\n" "add x10, x10, #0x1\n" "cmp x10, x20\n" - "fmla z24.s, p1/M, z9.s, z0.s\n" - "fmla z25.s, p1/M, z9.s, z1.s\n" + "fmla z24.s, p1/M, z16.s, z0.s\n" + "fmla z25.s, p1/M, z16.s, z1.s\n" "addvl x12, x12, #1\n" - "fmla z26.s, p1/M, z9.s, z2.s\n" - "fmla z27.s, p1/M, z9.s, z3.s\n" - "fmla z28.s, p1/M, z9.s, z4.s\n" - "fmla z29.s, p1/M, z9.s, z5.s\n" - "fmla z30.s, p1/M, z9.s, z6.s\n" - "fmla z31.s, p1/M, z9.s, z7.s\n" + "fmla z26.s, p1/M, z16.s, z2.s\n" + "fmla z27.s, p1/M, z16.s, z3.s\n" + "fmla z28.s, p1/M, z16.s, z4.s\n" + "fmla z29.s, p1/M, z16.s, z5.s\n" + "fmla z30.s, p1/M, z16.s, z6.s\n" + "fmla z31.s, p1/M, z16.s, z7.s\n" "bne 90b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x27, x11, x20, LSL #2\n" @@ -1132,12 +1132,11 @@ void sve_hybrid_fp32_mla_8x1VL_a64fx ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "98:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), 
[offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp index 9679d49506..161c85e5f3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp @@ -127,11 +127,11 @@ void sve_hybrid_fp32_mla_8x1VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" "cbnz x10, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -144,39 +144,39 @@ void sve_hybrid_fp32_mla_8x1VL ( "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [x12, #2, MUL VL]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" "sub x9, x9, #0x4\n" "cmp x9, #0x4\n" - "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z24.s, z16.s, z0.s[3]\n" "add x28, x28, #0x10\n" "addvl x12, x12, #4\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z8.s, z0.s[0]\n" + "fmla z24.s, z16.s, z0.s[0]\n" "addvl x12, x12, #1\n" "ble 11f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" "addvl x12, x12, #1\n" "ble 11f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla 
z24.s, z10.s, z0.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" "addvl x12, x12, #1\n" "ble 11f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" "addvl x12, x12, #1\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -214,9 +214,9 @@ void sve_hybrid_fp32_mla_8x1VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" + "add x20, x11, x20, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "ld1w { z25.s }, p1/Z, [x27]\n" + "ld1w { z25.s }, p1/Z, [x20]\n" "b 18f\n" "17:" // Height 2: no accumulate "mov z24.b, #0x0\n" @@ -226,12 +226,12 @@ void sve_hybrid_fp32_mla_8x1VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" "cbnz x10, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -239,29 +239,29 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" "21:" // Height 2: input setup done "cmp x9, #0x4\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "ld1rqw { z0.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z1.s[0]\n" + "fmla z25.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z24.s, z16.s, z1.s[1]\n" + "fmla z25.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" + "fmla z24.s, z17.s, z1.s[2]\n" + "fmla z25.s, z17.s, z0.s[2]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z24.s, z16.s, z1.s[3]\n" + "fmla z25.s, z16.s, z0.s[3]\n" "add x27, x27, #0x10\n" "addvl x12, x12, #4\n" "bgt 22b\n" @@ -270,26 +270,26 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z0.s }, p0/Z, [x28]\n" "ld1rqw { z1.s }, p0/Z, [x27]\n" "subs x9, x9, #0x1\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" "addvl x12, x12, #1\n" "ble 24f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" "addvl x12, x12, #1\n" "ble 24f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + 
"ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" "addvl x12, x12, #1\n" "ble 24f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -333,11 +333,11 @@ void sve_hybrid_fp32_mla_8x1VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x21, x11, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { z26.s }, p1/Z, [x26]\n" + "ld1w { z25.s }, p1/Z, [x21]\n" + "ld1w { z26.s }, p1/Z, [x20]\n" "b 31f\n" "30:" // Height 3: no accumulate "mov z24.b, #0x0\n" @@ -348,13 +348,13 @@ void sve_hybrid_fp32_mla_8x1VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" "cbnz x10, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -363,38 +363,38 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" "34:" // Height 3: input setup done "cmp x9, #0x4\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" + "ld1rqw { z2.s }, p0/Z, [x28]\n" "ld1rqw { z1.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "ld1rqw { z0.s }, p0/Z, [x26]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z2.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z0.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z24.s, z16.s, z2.s[1]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" "cmp x9, #0x4\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z17.s, z2.s[2]\n" + "fmla z25.s, z17.s, z1.s[2]\n" "add x28, x28, #0x10\n" "add x27, x27, #0x10\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z26.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z2.s[3]\n" "add x26, x26, #0x10\n" "addvl x12, x12, #4\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla 
z26.s, z11.s, z2.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" + "fmla z26.s, z16.s, z0.s[3]\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -402,31 +402,31 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z1.s }, p0/Z, [x27]\n" "subs x9, x9, #0x1\n" "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" "addvl x12, x12, #1\n" "ble 37f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" "addvl x12, x12, #1\n" "ble 37f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, z2.s[2]\n" "addvl x12, x12, #1\n" "ble 37f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -474,13 +474,13 @@ void sve_hybrid_fp32_mla_8x1VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x22, x11, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { z26.s }, p1/Z, [x26]\n" - "ld1w { z27.s }, p1/Z, [x25]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x22]\n" + "ld1w { z26.s }, p1/Z, [x21]\n" + "ld1w { z27.s }, p1/Z, [x20]\n" "b 44f\n" "43:" // Height 4: no accumulate "mov z24.b, #0x0\n" @@ -492,14 +492,14 @@ void sve_hybrid_fp32_mla_8x1VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" "cbnz x10, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -509,45 +509,45 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "47:" // Height 4: input setup done "cmp x9, #0x4\n" "ble 49f\n" "48:" // Height 4: Multiply loop: 
Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "ld1rqw { z2.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1rqw { z3.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x26]\n" + "ld1rqw { z0.s }, p0/Z, [x25]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z3.s[0]\n" + "fmla z25.s, z16.s, z2.s[0]\n" + "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z26.s, z16.s, z1.s[0]\n" + "fmla z27.s, z16.s, z0.s[0]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" + "fmla z24.s, z18.s, z3.s[1]\n" + "fmla z25.s, z18.s, z2.s[1]\n" "add x27, x27, #0x10\n" "add x26, x26, #0x10\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z26.s, z18.s, z1.s[1]\n" + "fmla z27.s, z18.s, z0.s[1]\n" "add x25, x25, #0x10\n" "addvl x12, x12, #4\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z24.s, z17.s, z3.s[2]\n" + "fmla z25.s, z17.s, z2.s[2]\n" + "fmla z26.s, z17.s, z1.s[2]\n" + "fmla z27.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z3.s[3]\n" + "fmla z25.s, z16.s, z2.s[3]\n" + "fmla z26.s, z16.s, z1.s[3]\n" + "fmla z27.s, z16.s, z0.s[3]\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -556,35 +556,35 @@ void sve_hybrid_fp32_mla_8x1VL ( "subs x9, x9, #0x1\n" "ld1rqw { z2.s }, p0/Z, [x26]\n" "ld1rqw { z3.s }, p0/Z, [x25]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" "ble 50f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" + "fmla z27.s, z16.s, z3.s[1]\n" "addvl x12, x12, #1\n" "ble 50f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z27.s, z16.s, z3.s[2]\n" "addvl x12, x12, #1\n" "ble 50f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z26.s, z16.s, 
z2.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -637,15 +637,15 @@ void sve_hybrid_fp32_mla_8x1VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "add x23, x11, x20, LSL #2\n" + "add x22, x23, x20, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { z26.s }, p1/Z, [x26]\n" - "ld1w { z27.s }, p1/Z, [x25]\n" - "ld1w { z28.s }, p1/Z, [x24]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x23]\n" + "ld1w { z26.s }, p1/Z, [x22]\n" + "ld1w { z27.s }, p1/Z, [x21]\n" + "ld1w { z28.s }, p1/Z, [x20]\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z24.b, #0x0\n" @@ -658,15 +658,15 @@ void sve_hybrid_fp32_mla_8x1VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" "cbnz x10, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -677,52 +677,52 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "60:" // Height 5: input setup done "cmp x9, #0x4\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z4.s }, p0/Z, [x28]\n" + "ld1rqw { z3.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1rqw { z3.s }, p0/Z, [x25]\n" + "ld1rqw { z1.s }, p0/Z, [x25]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x24]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "ld1rqw { z0.s }, p0/Z, [x24]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z4.s[0]\n" + "fmla z25.s, z16.s, z3.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z1.s[0]\n" + "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z24.s, z18.s, z4.s[1]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" "add x27, x27, #0x10\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z25.s, z18.s, 
z3.s[1]\n" + "fmla z26.s, z18.s, z2.s[1]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "fmla z27.s, z9.s, z3.s[1]\n" - "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z27.s, z18.s, z1.s[1]\n" + "fmla z28.s, z18.s, z0.s[1]\n" "add x24, x24, #0x10\n" "addvl x12, x12, #4\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z24.s, z17.s, z4.s[2]\n" + "fmla z25.s, z17.s, z3.s[2]\n" + "fmla z26.s, z17.s, z2.s[2]\n" + "fmla z27.s, z17.s, z1.s[2]\n" + "fmla z28.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z4.s[3]\n" + "fmla z25.s, z16.s, z3.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" + "fmla z27.s, z16.s, z1.s[3]\n" + "fmla z28.s, z16.s, z0.s[3]\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -732,39 +732,39 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z2.s }, p0/Z, [x26]\n" "ld1rqw { z3.s }, p0/Z, [x25]\n" "ld1rqw { z4.s }, p0/Z, [x24]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z8.s, z4.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" "ble 63f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" + "fmla z27.s, z16.s, z3.s[1]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z28.s, z16.s, z4.s[1]\n" "ble 63f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z27.s, z16.s, z3.s[2]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z28.s, z16.s, z4.s[2]\n" "ble 63f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z4.s[3]\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -821,18 +821,18 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 70f\n" "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x23, x11, x24, LSL #2\n" + "add x20, x23, x24, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { 
z26.s }, p1/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z27.s }, p1/Z, [x25]\n" - "ld1w { z28.s }, p1/Z, [x24]\n" - "ld1w { z29.s }, p1/Z, [x23]\n" + "add x22, x20, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x23]\n" + "ld1w { z26.s }, p1/Z, [x20]\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x22]\n" + "ld1w { z28.s }, p1/Z, [x21]\n" + "ld1w { z29.s }, p1/Z, [x20]\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z24.b, #0x0\n" @@ -846,16 +846,16 @@ void sve_hybrid_fp32_mla_8x1VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" "cbnz x10, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -867,59 +867,59 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "73:" // Height 6: input setup done "cmp x9, #0x4\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z5.s }, p0/Z, [x28]\n" + "ld1rqw { z4.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1rqw { z3.s }, p0/Z, [x25]\n" + "ld1rqw { z3.s }, p0/Z, [x26]\n" + "ld1rqw { z2.s }, p0/Z, [x25]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x24]\n" - "ld1rqw { z5.s }, p0/Z, [x23]\n" + "ld1rqw { z1.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" "add x27, x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z19.s }, p2/Z, [x12]\n" + "fmla z24.s, z19.s, z5.s[0]\n" + "fmla z25.s, z19.s, z4.s[0]\n" + "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z26.s, z19.s, z3.s[0]\n" + "fmla z27.s, z19.s, z2.s[0]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" + "fmla z28.s, z19.s, z1.s[0]\n" + "fmla z29.s, z19.s, z0.s[0]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" + "fmla z24.s, z18.s, z5.s[1]\n" + "fmla z25.s, z18.s, z4.s[1]\n" "add x23, x23, #0x10\n" "addvl x12, x12, #4\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" - "fmla z28.s, 
z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z26.s, z18.s, z3.s[1]\n" + "fmla z27.s, z18.s, z2.s[1]\n" + "fmla z28.s, z18.s, z1.s[1]\n" + "fmla z29.s, z18.s, z0.s[1]\n" + "fmla z24.s, z17.s, z5.s[2]\n" + "fmla z25.s, z17.s, z4.s[2]\n" + "fmla z26.s, z17.s, z3.s[2]\n" + "fmla z27.s, z17.s, z2.s[2]\n" + "fmla z28.s, z17.s, z1.s[2]\n" + "fmla z29.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z5.s[3]\n" + "fmla z25.s, z16.s, z4.s[3]\n" + "fmla z26.s, z16.s, z3.s[3]\n" + "fmla z27.s, z16.s, z2.s[3]\n" + "fmla z28.s, z16.s, z1.s[3]\n" + "fmla z29.s, z16.s, z0.s[3]\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -930,43 +930,43 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z3.s }, p0/Z, [x25]\n" "ld1rqw { z4.s }, p0/Z, [x24]\n" "ld1rqw { z5.s }, p0/Z, [x23]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" "ble 76f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" + "fmla z27.s, z16.s, z3.s[1]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" + "fmla z28.s, z16.s, z4.s[1]\n" + "fmla z29.s, z16.s, z5.s[1]\n" "ble 76f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z27.s, z16.s, z3.s[2]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z28.s, z16.s, z4.s[2]\n" + "fmla z29.s, z16.s, z5.s[2]\n" "ble 76f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z4.s[3]\n" + "fmla z29.s, z16.s, z5.s[3]\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -1028,20 +1028,20 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 83f\n" "81:" // Height 7: no bias "tbz %x[flags], #0, 82f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, 
x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x21, x11, x24, LSL #2\n" + "add x20, x21, x24, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { z26.s }, p1/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z27.s }, p1/Z, [x25]\n" - "ld1w { z28.s }, p1/Z, [x24]\n" - "ld1w { z29.s }, p1/Z, [x23]\n" - "ld1w { z30.s }, p1/Z, [x22]\n" + "add x23, x20, x24, LSL #2\n" + "add x22, x23, x24, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x21]\n" + "ld1w { z26.s }, p1/Z, [x20]\n" + "add x21, x22, x24, LSL #2\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x23]\n" + "ld1w { z28.s }, p1/Z, [x22]\n" + "ld1w { z29.s }, p1/Z, [x21]\n" + "ld1w { z30.s }, p1/Z, [x20]\n" "b 83f\n" "82:" // Height 7: no accumulate "mov z24.b, #0x0\n" @@ -1056,17 +1056,17 @@ void sve_hybrid_fp32_mla_8x1VL ( "84:" // Height 7: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 85f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" "cbnz x10, 86f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1079,66 +1079,66 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 86f\n" "85:" // Height 7: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "86:" // Height 7: input setup done "cmp x9, #0x4\n" "ble 88f\n" "87:" // Height 7: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z6.s }, p0/Z, [x28]\n" + "ld1rqw { z5.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" "ld1rqw { z3.s }, p0/Z, [x25]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x24]\n" - "ld1rqw { z5.s }, p0/Z, [x23]\n" + "ld1rqw { z2.s }, p0/Z, [x24]\n" + "ld1rqw { z1.s }, p0/Z, [x23]\n" "add x27, x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" + "ld1rqw { z0.s }, p0/Z, [x22]\n" + "ld1w { z19.s }, p2/Z, [x12]\n" + "fmla z24.s, z19.s, z6.s[0]\n" + "fmla 
z25.s, z19.s, z5.s[0]\n" + "fmla z26.s, z19.s, z4.s[0]\n" + "fmla z27.s, z19.s, z3.s[0]\n" + "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "fmla z28.s, z19.s, z2.s[0]\n" + "fmla z29.s, z19.s, z1.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" "add x25, x25, #0x10\n" - "fmla z30.s, z8.s, z6.s[0]\n" - "fmla z24.s, z9.s, z0.s[1]\n" + "fmla z30.s, z19.s, z0.s[0]\n" + "fmla z24.s, z18.s, z6.s[1]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z25.s, z18.s, z5.s[1]\n" + "fmla z26.s, z18.s, z4.s[1]\n" "add x22, x22, #0x10\n" "addvl x12, x12, #4\n" - "fmla z27.s, z9.s, z3.s[1]\n" - "fmla z28.s, z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" - "fmla z30.s, z9.s, z6.s[1]\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" - "fmla z30.s, z10.s, z6.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" - "fmla z30.s, z11.s, z6.s[3]\n" + "fmla z27.s, z18.s, z3.s[1]\n" + "fmla z28.s, z18.s, z2.s[1]\n" + "fmla z29.s, z18.s, z1.s[1]\n" + "fmla z30.s, z18.s, z0.s[1]\n" + "fmla z24.s, z17.s, z6.s[2]\n" + "fmla z25.s, z17.s, z5.s[2]\n" + "fmla z26.s, z17.s, z4.s[2]\n" + "fmla z27.s, z17.s, z3.s[2]\n" + "fmla z28.s, z17.s, z2.s[2]\n" + "fmla z29.s, z17.s, z1.s[2]\n" + "fmla z30.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z6.s[3]\n" + "fmla z25.s, z16.s, z5.s[3]\n" + "fmla z26.s, z16.s, z4.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z2.s[3]\n" + "fmla z29.s, z16.s, z1.s[3]\n" + "fmla z30.s, z16.s, z0.s[3]\n" "bgt 87b\n" "88:" // Height 7: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -1150,47 +1150,47 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z4.s }, p0/Z, [x24]\n" "ld1rqw { z5.s }, p0/Z, [x23]\n" "ld1rqw { z6.s }, p0/Z, [x22]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" - "fmla z30.s, z8.s, z6.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" "ble 89f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" + "fmla z27.s, z16.s, z3.s[1]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" - "fmla z30.s, z9.s, z6.s[1]\n" + "fmla z28.s, z16.s, z4.s[1]\n" + "fmla z29.s, z16.s, z5.s[1]\n" + "fmla z30.s, z16.s, z6.s[1]\n" "ble 89f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, 
z2.s[2]\n" + "fmla z27.s, z16.s, z3.s[2]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" - "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z28.s, z16.s, z4.s[2]\n" + "fmla z29.s, z16.s, z5.s[2]\n" + "fmla z30.s, z16.s, z6.s[2]\n" "ble 89f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" - "fmla z30.s, z11.s, z6.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z4.s[3]\n" + "fmla z29.s, z16.s, z5.s[3]\n" + "fmla z30.s, z16.s, z6.s[3]\n" "89:" // Height 7: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -1260,22 +1260,22 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 96f\n" "94:" // Height 8: no bias "tbz %x[flags], #0, 95f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x27, x11, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" + "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n" + "add x22, x11, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" "ld1w { z24.s }, p1/Z, [x11]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z25.s }, p1/Z, [x27]\n" - "ld1w { z26.s }, p1/Z, [x26]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z27.s }, p1/Z, [x25]\n" - "ld1w { z28.s }, p1/Z, [x24]\n" - "add x21, x22, x20, LSL #2\n" - "ld1w { z29.s }, p1/Z, [x23]\n" - "ld1w { z30.s }, p1/Z, [x22]\n" - "ld1w { z31.s }, p1/Z, [x21]\n" + "add x23, x21, x24, LSL #2\n" + "add x20, x23, x24, LSL #2\n" + "ld1w { z25.s }, p1/Z, [x22]\n" + "ld1w { z26.s }, p1/Z, [x21]\n" + "add x22, x20, x24, LSL #2\n" + "add x21, x22, x24, LSL #2\n" + "ld1w { z27.s }, p1/Z, [x23]\n" + "ld1w { z28.s }, p1/Z, [x20]\n" + "add x20, x21, x24, LSL #2\n" + "ld1w { z29.s }, p1/Z, [x22]\n" + "ld1w { z30.s }, p1/Z, [x21]\n" + "ld1w { z31.s }, p1/Z, [x20]\n" "b 96f\n" "95:" // Height 8: no accumulate "mov z24.b, #0x0\n" @@ -1291,18 +1291,18 @@ void sve_hybrid_fp32_mla_8x1VL ( "97:" // Height 8: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w9, [x20, x10, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 98f\n" - "ldr x21, [%x[input_ptr], x10, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x28, [x21, #0x0]\n" - "ldr x27, [x21, #0x8]\n" - "ldr x26, [x21, #0x10]\n" - "ldr x25, [x21, #0x18]\n" - "ldr x24, [x21, #0x20]\n" - "ldr x23, [x21, #0x28]\n" - "ldr x22, [x21, #0x30]\n" - "ldr x21, [x21, #0x38]\n" + "ldr x20, [%x[input_ptr], x10, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x28, [x20, #0x0]\n" + "ldr x27, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x25, [x20, #0x18]\n" + "ldr x24, [x20, #0x20]\n" + "ldr x23, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x21, [x20, #0x38]\n" "cbnz x10, 99f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x28, x28, x20, LSL #2\n" @@ -1316,73 +1316,73 @@ void sve_hybrid_fp32_mla_8x1VL ( "b 99f\n" "98:" // Height 8: setup direct input "mov x28, %x[input_ptr]\n" - "add x27, x28, x20, LSL #2\n" - "add x26, x27, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" 
- "add x21, x22, x20, LSL #2\n" + "add x27, x28, x21, LSL #2\n" + "add x26, x27, x21, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "99:" // Height 8: input setup done "cmp x9, #0x4\n" "ble 101f\n" "100:" // Height 8: Multiply loop: Main loop head "whilelt p0.s, XZR, x9\n" - "ld1rqw { z0.s }, p0/Z, [x28]\n" - "ld1rqw { z1.s }, p0/Z, [x27]\n" + "ld1rqw { z7.s }, p0/Z, [x28]\n" + "ld1rqw { z6.s }, p0/Z, [x27]\n" "sub x9, x9, #0x4\n" - "ld1rqw { z2.s }, p0/Z, [x26]\n" - "ld1rqw { z3.s }, p0/Z, [x25]\n" + "ld1rqw { z5.s }, p0/Z, [x26]\n" + "ld1rqw { z4.s }, p0/Z, [x25]\n" "cmp x9, #0x4\n" "add x28, x28, #0x10\n" - "ld1rqw { z4.s }, p0/Z, [x24]\n" - "ld1rqw { z5.s }, p0/Z, [x23]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "ld1rqw { z2.s }, p0/Z, [x23]\n" "add x27, x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqw { z6.s }, p0/Z, [x22]\n" - "ld1rqw { z7.s }, p0/Z, [x21]\n" + "ld1rqw { z1.s }, p0/Z, [x22]\n" + "ld1rqw { z0.s }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" - "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z19.s }, p2/Z, [x12]\n" + "fmla z24.s, z19.s, z7.s[0]\n" + "fmla z25.s, z19.s, z6.s[0]\n" + "ld1w { z18.s }, p2/Z, [x12, #1, MUL VL]\n" + "fmla z26.s, z19.s, z5.s[0]\n" + "fmla z27.s, z19.s, z4.s[0]\n" + "ld1w { z17.s }, p2/Z, [x12, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x12, #3, MUL VL]\n" + "fmla z28.s, z19.s, z3.s[0]\n" + "fmla z29.s, z19.s, z2.s[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "fmla z30.s, z8.s, z6.s[0]\n" - "fmla z31.s, z8.s, z7.s[0]\n" + "fmla z30.s, z19.s, z1.s[0]\n" + "fmla z31.s, z19.s, z0.s[0]\n" "add x21, x21, #0x10\n" "addvl x12, x12, #4\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" - "fmla z28.s, z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" - "fmla z30.s, z9.s, z6.s[1]\n" - "fmla z31.s, z9.s, z7.s[1]\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" - "fmla z30.s, z10.s, z6.s[2]\n" - "fmla z31.s, z10.s, z7.s[2]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" - "fmla z30.s, z11.s, z6.s[3]\n" - "fmla z31.s, z11.s, z7.s[3]\n" + "fmla z24.s, z18.s, z7.s[1]\n" + "fmla z25.s, z18.s, z6.s[1]\n" + "fmla z26.s, z18.s, z5.s[1]\n" + "fmla z27.s, z18.s, z4.s[1]\n" + "fmla z28.s, z18.s, z3.s[1]\n" + "fmla z29.s, z18.s, z2.s[1]\n" + "fmla z30.s, z18.s, z1.s[1]\n" + "fmla z31.s, z18.s, z0.s[1]\n" + "fmla z24.s, z17.s, z7.s[2]\n" + "fmla z25.s, z17.s, z6.s[2]\n" + "fmla z26.s, z17.s, z5.s[2]\n" + "fmla z27.s, z17.s, z4.s[2]\n" + "fmla z28.s, z17.s, z3.s[2]\n" + "fmla z29.s, z17.s, z2.s[2]\n" + "fmla z30.s, z17.s, z1.s[2]\n" + "fmla z31.s, z17.s, z0.s[2]\n" + "fmla z24.s, z16.s, z7.s[3]\n" + "fmla z25.s, z16.s, z6.s[3]\n" + "fmla z26.s, z16.s, z5.s[3]\n" + "fmla z27.s, z16.s, z4.s[3]\n" + "fmla z28.s, z16.s, z3.s[3]\n" + 
"fmla z29.s, z16.s, z2.s[3]\n" + "fmla z30.s, z16.s, z1.s[3]\n" + "fmla z31.s, z16.s, z0.s[3]\n" "bgt 100b\n" "101:" // Height 8: Multiply loop: Single iteration only "whilelt p0.s, XZR, x9\n" @@ -1395,51 +1395,51 @@ void sve_hybrid_fp32_mla_8x1VL ( "ld1rqw { z5.s }, p0/Z, [x23]\n" "ld1rqw { z6.s }, p0/Z, [x22]\n" "ld1rqw { z7.s }, p0/Z, [x21]\n" - "ld1w { z8.s }, p2/Z, [x12]\n" - "fmla z24.s, z8.s, z0.s[0]\n" - "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[0]\n" + "fmla z25.s, z16.s, z1.s[0]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z8.s, z2.s[0]\n" - "fmla z27.s, z8.s, z3.s[0]\n" - "fmla z28.s, z8.s, z4.s[0]\n" - "fmla z29.s, z8.s, z5.s[0]\n" - "fmla z30.s, z8.s, z6.s[0]\n" - "fmla z31.s, z8.s, z7.s[0]\n" + "fmla z26.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z3.s[0]\n" + "fmla z28.s, z16.s, z4.s[0]\n" + "fmla z29.s, z16.s, z5.s[0]\n" + "fmla z30.s, z16.s, z6.s[0]\n" + "fmla z31.s, z16.s, z7.s[0]\n" "ble 102f\n" - "ld1w { z9.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z9.s, z0.s[1]\n" - "fmla z25.s, z9.s, z1.s[1]\n" - "fmla z26.s, z9.s, z2.s[1]\n" - "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "fmla z25.s, z16.s, z1.s[1]\n" + "fmla z26.s, z16.s, z2.s[1]\n" + "fmla z27.s, z16.s, z3.s[1]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z9.s, z4.s[1]\n" - "fmla z29.s, z9.s, z5.s[1]\n" - "fmla z30.s, z9.s, z6.s[1]\n" - "fmla z31.s, z9.s, z7.s[1]\n" + "fmla z28.s, z16.s, z4.s[1]\n" + "fmla z29.s, z16.s, z5.s[1]\n" + "fmla z30.s, z16.s, z6.s[1]\n" + "fmla z31.s, z16.s, z7.s[1]\n" "ble 102f\n" - "ld1w { z10.s }, p2/Z, [x12]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" "subs x9, x9, #0x1\n" - "fmla z24.s, z10.s, z0.s[2]\n" - "fmla z25.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z16.s, z0.s[2]\n" + "fmla z25.s, z16.s, z1.s[2]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z27.s, z16.s, z3.s[2]\n" "addvl x12, x12, #1\n" - "fmla z28.s, z10.s, z4.s[2]\n" - "fmla z29.s, z10.s, z5.s[2]\n" - "fmla z30.s, z10.s, z6.s[2]\n" - "fmla z31.s, z10.s, z7.s[2]\n" + "fmla z28.s, z16.s, z4.s[2]\n" + "fmla z29.s, z16.s, z5.s[2]\n" + "fmla z30.s, z16.s, z6.s[2]\n" + "fmla z31.s, z16.s, z7.s[2]\n" "ble 102f\n" - "ld1w { z11.s }, p2/Z, [x12]\n" - "fmla z24.s, z11.s, z0.s[3]\n" - "fmla z25.s, z11.s, z1.s[3]\n" + "ld1w { z16.s }, p2/Z, [x12]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "fmla z25.s, z16.s, z1.s[3]\n" "addvl x12, x12, #1\n" - "fmla z26.s, z11.s, z2.s[3]\n" - "fmla z27.s, z11.s, z3.s[3]\n" - "fmla z28.s, z11.s, z4.s[3]\n" - "fmla z29.s, z11.s, z5.s[3]\n" - "fmla z30.s, z11.s, z6.s[3]\n" - "fmla z31.s, z11.s, z7.s[3]\n" + "fmla z26.s, z16.s, z2.s[3]\n" + "fmla z27.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z4.s[3]\n" + "fmla z29.s, z16.s, z5.s[3]\n" + "fmla z30.s, z16.s, z6.s[3]\n" + "fmla z31.s, z16.s, z7.s[3]\n" "102:" // Height 8: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x10, x10, #0x1\n" @@ -1500,12 +1500,11 @@ void sve_hybrid_fp32_mla_8x1VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "106:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, 
input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp index ab175a3758..66c106d2eb 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -75,7 +75,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, float>::value) { switch (ci->get_cpu_model()) { default: @@ -100,5 +99,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp index 8d05c1ffb3..2b2a0684f9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp @@ -140,22 +140,22 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "b 5f\n" "3:" // Height 1: no bias "tbz %x[flags], #0, 4f\n" - "ld1w { z9.s }, p6/Z, [x27]\n" - "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x27]\n" + "ld1w { z20.s }, p5/Z, [x27, #1, MUL VL]\n" + "zip1 z8.d, z21.d, z14.d\n" + "zip2 z14.d, z21.d, z14.d\n" + "ld1w { z23.s }, p4/Z, [x27, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n" + "zip1 z9.d, z20.d, z15.d\n" + "zip2 z15.d, z20.d, z15.d\n" + "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" + "zip1 z10.d, z23.d, z16.d\n" + "zip2 z16.d, z23.d, z16.d\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "zip1 z12.d, z21.d, z18.d\n" + "zip2 z18.d, z21.d, z18.d\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" "b 5f\n" @@ -177,11 +177,11 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -193,69 +193,69 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "ble 10f\n" "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst
0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z21.h }, p7/Z, [x28]\n" + "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6475e708 // bfmmla z8.s, z24.h, z21.h\n" + ".inst 0x6474e70e // bfmmla z14.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6475e70a // bfmmla z10.s, z24.h, z21.h\n" + ".inst 0x6474e710 // bfmmla z16.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n" + "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n" "sub x25, x25, #0x4\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" + "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n" "cmp x25, #0x4\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" "add x24, x24, #0x10\n" "addvl x28, x28, #-4\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + "ld1rqw { z23.s }, p0/Z, [x24]\n" + ".inst 0x658abef7 // bfcvt z23.h, p7/M, z23.s\n" + "uzp1 z23.h, z23.h, z23.h\n" + "ld1h { z21.h }, p7/Z, [x28]\n" + "ld1h { z20.h }, p7/Z, [x28, #1, MUL VL]\n" + ".inst 0x6475e6e8 // bfmmla z8.s, z23.h, z21.h\n" + ".inst 0x6474e6ee // bfmmla z14.s, z23.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6475e6e9 // bfmmla z9.s, z23.h, z21.h\n" + ".inst 0x6474e6ef // bfmmla z15.s, z23.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #4, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6475e6ea // bfmmla z10.s, z23.h, z21.h\n" + ".inst 0x6474e6f0 
// bfmmla z16.s, z23.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6475e6eb // bfmmla z11.s, z23.h, z21.h\n" + ".inst 0x6474e6f1 // bfmmla z17.s, z23.h, z20.h\n" + "ld1h { z20.h }, p7/Z, [x28, #-8, MUL VL]\n" + "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6474e6ec // bfmmla z12.s, z23.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6476e6f2 // bfmmla z18.s, z23.h, z22.h\n" + ".inst 0x6475e6ed // bfmmla z13.s, z23.h, z21.h\n" + ".inst 0x6474e6f3 // bfmmla z19.s, z23.h, z20.h\n" "addvl x28, x28, #-4\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -270,21 +270,21 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "uzp1 z13.d, z13.d, z19.d\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" + "ld1rw { z21.s }, p7/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" + "ld1rw { z20.s }, p7/Z, [x20]\n" + "fmin z8.s, p7/M, z8.s, z21.s\n" + "fmin z9.s, p7/M, z9.s, z21.s\n" + "fmin z10.s, p7/M, z10.s, z21.s\n" + "fmin z11.s, p7/M, z11.s, z21.s\n" + "fmin z12.s, p7/M, z12.s, z21.s\n" + "fmin z13.s, p7/M, z13.s, z21.s\n" + "fmax z8.s, p7/M, z8.s, z20.s\n" + "fmax z9.s, p7/M, z9.s, z20.s\n" + "fmax z10.s, p7/M, z10.s, z20.s\n" + "fmax z11.s, p7/M, z11.s, z20.s\n" + "fmax z12.s, p7/M, z12.s, z20.s\n" + "fmax z13.s, p7/M, z13.s, z20.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p6, [x27]\n" "st1w { z9.s }, p5, [x27, #1, MUL VL]\n" @@ -340,29 +340,29 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x27, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x27]\n" - "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n" + "add x20, x27, x20, LSL #2\n" + "ld1w { z16.s }, p6/Z, [x27]\n" + "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x27, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, 
p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" + "ld1w { z14.s }, p6/Z, [x20]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x20, #5, MUL VL]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "zip1 z12.d, z21.d, z18.d\n" + "zip2 z18.d, z21.d, z18.d\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" "b 18f\n" @@ -384,12 +384,12 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -397,85 +397,85 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "21:" // Height 2: input setup done "cmp x25, #0x4\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + "ld1rqw { z20.s }, p0/Z, [x23]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z23.h }, p7/Z, [x28]\n" + "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n" + "uzp1 z20.h, z20.h, z20.h\n" + "trn1 z24.d, z24.d, z20.d\n" + "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n" + ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n" + "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, 
[x28, #6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n" + ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n" + "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n" "sub x25, x25, #0x4\n" "cmp x25, #0x4\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "addvl x28, x28, #-4\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + "ld1rqw { z24.s }, p0/Z, [x24]\n" + "ld1rqw { z20.s }, p0/Z, [x23]\n" + ".inst 0x658abf18 // bfcvt z24.h, p7/M, z24.s\n" + ".inst 0x658abe94 // bfcvt z20.h, p7/M, z20.s\n" + "uzp1 z24.h, z24.h, z24.h\n" + "ld1h { z23.h }, p7/Z, [x28]\n" + "ld1h { z22.h }, p7/Z, [x28, #1, MUL VL]\n" + "uzp1 z20.h, z20.h, z20.h\n" + "trn1 z24.d, z24.d, z20.d\n" + "ld1h { z21.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6477e708 // bfmmla z8.s, z24.h, z23.h\n" + ".inst 0x6476e70e // bfmmla z14.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28, #4, MUL VL]\n" + "ld1h { z22.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6475e709 // bfmmla z9.s, z24.h, z21.h\n" + ".inst 0x6474e70f // bfmmla z15.s, z24.h, z20.h\n" + "ld1h { z21.h }, p7/Z, [x28, #6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - 
".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" + ".inst 0x6477e70a // bfmmla z10.s, z24.h, z23.h\n" + ".inst 0x6476e710 // bfmmla z16.s, z24.h, z22.h\n" + "ld1h { z23.h }, p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6475e70b // bfmmla z11.s, z24.h, z21.h\n" + ".inst 0x6474e711 // bfmmla z17.s, z24.h, z20.h\n" + "ld1h { z22.h }, p7/Z, [x28, #-7, MUL VL]\n" + "ld1h { z21.h }, p7/Z, [x28, #-6, MUL VL]\n" + "ld1h { z20.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6477e70c // bfmmla z12.s, z24.h, z23.h\n" + ".inst 0x6476e712 // bfmmla z18.s, z24.h, z22.h\n" "addvl x28, x28, #-4\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" + ".inst 0x6475e70d // bfmmla z13.s, z24.h, z21.h\n" + ".inst 0x6474e713 // bfmmla z19.s, z24.h, z20.h\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -497,33 +497,33 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "uzp2 z13.d, z13.d, z19.d\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" + "ld1rw { z20.s }, p7/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z4.s, p7/M, z4.s, z1.s\n" - "fmin z14.s, p7/M, z14.s, z1.s\n" - "fmin z15.s, p7/M, z15.s, z1.s\n" - "fmin z16.s, p7/M, z16.s, z1.s\n" - "fmin z17.s, p7/M, z17.s, z1.s\n" - "fmin z18.s, p7/M, z18.s, z1.s\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmax z4.s, p7/M, z4.s, z0.s\n" - "fmax z14.s, p7/M, z14.s, z0.s\n" - "fmax z15.s, p7/M, z15.s, z0.s\n" - "fmax z16.s, p7/M, z16.s, z0.s\n" - "fmax z17.s, p7/M, z17.s, z0.s\n" - "fmax z18.s, p7/M, z18.s, z0.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" + "ld1rw { z19.s }, p7/Z, [x20]\n" + "fmin z4.s, p7/M, z4.s, z20.s\n" + "fmin z14.s, p7/M, z14.s, z20.s\n" + "fmin z15.s, p7/M, z15.s, z20.s\n" + "fmin z16.s, p7/M, z16.s, z20.s\n" + "fmin z17.s, p7/M, z17.s, z20.s\n" + "fmin z18.s, p7/M, z18.s, z20.s\n" + "fmin z8.s, p7/M, z8.s, z20.s\n" + "fmin z9.s, p7/M, z9.s, z20.s\n" + "fmin z10.s, p7/M, z10.s, z20.s\n" + "fmin z11.s, p7/M, z11.s, z20.s\n" + "fmin z12.s, p7/M, z12.s, z20.s\n" + "fmin z13.s, p7/M, z13.s, z20.s\n" + "fmax z4.s, p7/M, z4.s, z19.s\n" + "fmax z14.s, p7/M, z14.s, z19.s\n" + "fmax z15.s, p7/M, z15.s, z19.s\n" + "fmax z16.s, p7/M, z16.s, z19.s\n" + "fmax z17.s, p7/M, z17.s, z19.s\n" + "fmax z18.s, p7/M, z18.s, z19.s\n" + "fmax z8.s, p7/M, z8.s, z19.s\n" + "fmax z9.s, p7/M, z9.s, z19.s\n" + "fmax z10.s, p7/M, z10.s, z19.s\n" + "fmax z11.s, p7/M, z11.s, z19.s\n" + "fmax z12.s, p7/M, z12.s, z19.s\n" + "fmax z13.s, p7/M, z13.s, z19.s\n" "25:" // Height 2: No activation "st1w { z4.s }, p6, [x27]\n" "st1w { z14.s }, p5, [x27, #1, MUL VL]\n" @@ -597,38 +597,38 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x27, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x27]\n" - "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n" - "ld1w { 
z13.s }, p2/Z, [x27, #4, MUL VL]\n" + "add x21, x27, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z16.s }, p6/Z, [x27]\n" + "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n" + "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "ld1w { z21.s }, p6/Z, [x22]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n" - "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" - "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x21]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x21, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x21, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x20]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "ld1w { z22.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x20, #2, MUL VL]\n" + "zip1 z12.d, z24.d, z18.d\n" + "zip2 z18.d, z24.d, z18.d\n" + "ld1w { z24.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x20, #4, MUL VL]\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" - "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #5, MUL VL]\n" "zip1 z20.d, z21.d, z26.d\n" "zip2 z26.d, z21.d, z26.d\n" "zip1 z21.d, z22.d, z27.d\n" @@ -639,8 +639,8 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "zip2 z29.d, z24.d, z29.d\n" "zip1 z24.d, z25.d, z30.d\n" "zip2 z30.d, z25.d, z30.d\n" - "zip1 z25.d, z4.d, z31.d\n" - "zip2 z31.d, z4.d, z31.d\n" + "zip1 z25.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 31f\n" "30:" // Height 3: no accumulate "mov z8.b, #0x0\n" @@ -672,13 +672,13 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -687,117 +687,117 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "34:" // Height 3: input 
setup done "cmp x25, #0x4\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" + "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n" + "trn1 z5.d, z5.d, z0.d\n" + "uzp1 z4.h, z4.h, z4.h\n" + "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n" + ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n" + ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n" "sub x25, x25, #0x4\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n" "cmp x25, #0x4\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" + ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n" + ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n" + ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n" + ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n" + 
".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n" "addvl x28, x28, #-4\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n" + ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z0.s }, p0/Z, [x23]\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" + "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n" + "trn1 z5.d, z5.d, z0.d\n" + "uzp1 z4.h, z4.h, z4.h\n" + "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n" + ".inst 0x6463e4a8 // bfmmla z8.s, z5.h, z3.h\n" + ".inst 0x6463e494 // bfmmla z20.s, z4.h, z3.h\n" + ".inst 0x6462e4ae // bfmmla z14.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n" + ".inst 0x6462e49a // bfmmla z26.s, z4.h, z2.h\n" + ".inst 0x6461e4a9 // bfmmla z9.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6461e495 // bfmmla z21.s, z4.h, z1.h\n" + ".inst 0x6460e4af // bfmmla z15.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n" + ".inst 0x6460e49b // bfmmla z27.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, 
z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6463e4aa // bfmmla z10.s, z5.h, z3.h\n" + ".inst 0x6463e496 // bfmmla z22.s, z4.h, z3.h\n" + ".inst 0x6462e4b0 // bfmmla z16.s, z5.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6462e49c // bfmmla z28.s, z4.h, z2.h\n" + ".inst 0x6461e4ab // bfmmla z11.s, z5.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6461e497 // bfmmla z23.s, z4.h, z1.h\n" + ".inst 0x6460e4b1 // bfmmla z17.s, z5.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6463e4ac // bfmmla z12.s, z5.h, z3.h\n" "addvl x28, x28, #-4\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e498 // bfmmla z24.s, z4.h, z3.h\n" + ".inst 0x6462e4b2 // bfmmla z18.s, z5.h, z2.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ad // bfmmla z13.s, z5.h, z1.h\n" + ".inst 0x6461e499 // bfmmla z25.s, z4.h, z1.h\n" + ".inst 0x6460e4b3 // bfmmla z19.s, z5.h, z0.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -826,45 +826,45 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "uzp1 z25.d, z25.d, z31.d\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p7/Z, [x20]\n" - "add x20, %x[args_ptr], %[offset_min]\n" "ld1rw { z0.s }, p7/Z, [x20]\n" - "fmin z4.s, p7/M, z4.s, z1.s\n" - "fmin z14.s, p7/M, z14.s, z1.s\n" - "fmin z15.s, p7/M, z15.s, z1.s\n" - "fmin z16.s, p7/M, z16.s, z1.s\n" - "fmin z17.s, p7/M, z17.s, z1.s\n" - "fmin z18.s, p7/M, z18.s, z1.s\n" - "fmin z8.s, p7/M, z8.s, z1.s\n" - "fmin z9.s, p7/M, z9.s, z1.s\n" - "fmin z10.s, p7/M, z10.s, z1.s\n" - "fmin z11.s, p7/M, z11.s, z1.s\n" - "fmin z12.s, p7/M, z12.s, z1.s\n" - "fmin z13.s, p7/M, z13.s, z1.s\n" - "fmin z20.s, p7/M, z20.s, z1.s\n" - "fmin z21.s, p7/M, z21.s, z1.s\n" - "fmin z22.s, p7/M, z22.s, z1.s\n" - "fmin z23.s, p7/M, z23.s, z1.s\n" - "fmin z24.s, p7/M, z24.s, z1.s\n" - "fmin z25.s, p7/M, z25.s, z1.s\n" - "fmax z4.s, p7/M, z4.s, z0.s\n" - "fmax z14.s, p7/M, z14.s, z0.s\n" - "fmax z15.s, p7/M, z15.s, z0.s\n" - "fmax z16.s, p7/M, z16.s, z0.s\n" - "fmax z17.s, p7/M, z17.s, z0.s\n" - "fmax z18.s, p7/M, z18.s, z0.s\n" - "fmax z8.s, p7/M, z8.s, z0.s\n" - "fmax z9.s, p7/M, z9.s, z0.s\n" - "fmax z10.s, p7/M, z10.s, z0.s\n" - "fmax z11.s, p7/M, z11.s, z0.s\n" - "fmax z12.s, p7/M, z12.s, z0.s\n" - "fmax z13.s, p7/M, z13.s, z0.s\n" - "fmax z20.s, p7/M, z20.s, z0.s\n" - "fmax z21.s, p7/M, z21.s, z0.s\n" - "fmax z22.s, p7/M, z22.s, z0.s\n" - "fmax z23.s, p7/M, z23.s, z0.s\n" - "fmax z24.s, p7/M, z24.s, z0.s\n" - "fmax z25.s, p7/M, z25.s, z0.s\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1rw { z19.s }, p7/Z, [x20]\n" + "fmin z4.s, p7/M, z4.s, z0.s\n" + "fmin z14.s, p7/M, z14.s, z0.s\n" + "fmin z15.s, p7/M, z15.s, z0.s\n" + "fmin z16.s, p7/M, z16.s, z0.s\n" + "fmin z17.s, p7/M, 
z17.s, z0.s\n" + "fmin z18.s, p7/M, z18.s, z0.s\n" + "fmin z8.s, p7/M, z8.s, z0.s\n" + "fmin z9.s, p7/M, z9.s, z0.s\n" + "fmin z10.s, p7/M, z10.s, z0.s\n" + "fmin z11.s, p7/M, z11.s, z0.s\n" + "fmin z12.s, p7/M, z12.s, z0.s\n" + "fmin z13.s, p7/M, z13.s, z0.s\n" + "fmin z20.s, p7/M, z20.s, z0.s\n" + "fmin z21.s, p7/M, z21.s, z0.s\n" + "fmin z22.s, p7/M, z22.s, z0.s\n" + "fmin z23.s, p7/M, z23.s, z0.s\n" + "fmin z24.s, p7/M, z24.s, z0.s\n" + "fmin z25.s, p7/M, z25.s, z0.s\n" + "fmax z4.s, p7/M, z4.s, z19.s\n" + "fmax z14.s, p7/M, z14.s, z19.s\n" + "fmax z15.s, p7/M, z15.s, z19.s\n" + "fmax z16.s, p7/M, z16.s, z19.s\n" + "fmax z17.s, p7/M, z17.s, z19.s\n" + "fmax z18.s, p7/M, z18.s, z19.s\n" + "fmax z8.s, p7/M, z8.s, z19.s\n" + "fmax z9.s, p7/M, z9.s, z19.s\n" + "fmax z10.s, p7/M, z10.s, z19.s\n" + "fmax z11.s, p7/M, z11.s, z19.s\n" + "fmax z12.s, p7/M, z12.s, z19.s\n" + "fmax z13.s, p7/M, z13.s, z19.s\n" + "fmax z20.s, p7/M, z20.s, z19.s\n" + "fmax z21.s, p7/M, z21.s, z19.s\n" + "fmax z22.s, p7/M, z22.s, z19.s\n" + "fmax z23.s, p7/M, z23.s, z19.s\n" + "fmax z24.s, p7/M, z24.s, z19.s\n" + "fmax z25.s, p7/M, z25.s, z19.s\n" "38:" // Height 3: No activation "st1w { z4.s }, p6, [x27]\n" "st1w { z14.s }, p5, [x27, #1, MUL VL]\n" @@ -947,57 +947,57 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x23, x27, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z9.s }, p6/Z, [x27]\n" + "add x22, x27, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p5/Z, [x27, #1, MUL VL]\n" - "ld1w { z11.s }, p4/Z, [x27, #2, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x27, #3, MUL VL]\n" - "ld1w { z13.s }, p2/Z, [x27, #4, MUL VL]\n" + "ld1w { z16.s }, p6/Z, [x27]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p5/Z, [x27, #1, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x27, #2, MUL VL]\n" + "ld1w { z22.s }, p3/Z, [x27, #3, MUL VL]\n" + "ld1w { z24.s }, p2/Z, [x27, #4, MUL VL]\n" "ld1w { z20.s }, p1/Z, [x27, #5, MUL VL]\n" - "ld1w { z14.s }, p6/Z, [x23]\n" - "zip1 z8.d, z9.d, z14.d\n" - "zip2 z14.d, z9.d, z14.d\n" - "ld1w { z15.s }, p5/Z, [x23, #1, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z15.d\n" - "zip2 z15.d, z10.d, z15.d\n" - "ld1w { z17.s }, p3/Z, [x23, #3, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #4, MUL VL]\n" - "zip1 z10.d, z11.d, z16.d\n" - "zip2 z16.d, z11.d, z16.d\n" - "ld1w { z19.s }, p1/Z, [x23, #5, MUL VL]\n" - "ld1w { z21.s }, p6/Z, [x22]\n" - "zip1 z11.d, z12.d, z17.d\n" - "zip2 z17.d, z12.d, z17.d\n" - "ld1w { z22.s }, p5/Z, [x22, #1, MUL VL]\n" - "ld1w { z23.s }, p4/Z, [x22, #2, MUL VL]\n" - "zip1 z12.d, z13.d, z18.d\n" - "zip2 z18.d, z13.d, z18.d\n" - "ld1w { z24.s }, p3/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z14.s }, p6/Z, [x22]\n" + "zip1 z8.d, z16.d, z14.d\n" + "zip2 z14.d, z16.d, z14.d\n" + "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z15.d\n" + "zip2 z15.d, z17.d, z15.d\n" + "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n" + "zip1 z10.d, z19.d, z16.d\n" + "zip2 z16.d, z19.d, z16.d\n" + "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n" + "ld1w { z21.s }, p6/Z, [x21]\n" + "zip1 z11.d, z22.d, z17.d\n" + "zip2 z17.d, z22.d, z17.d\n" + "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n" + "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n" + "zip1 z12.d, z24.d, z18.d\n" + "zip2 z18.d, z24.d, z18.d\n" + "ld1w { z24.s 
}, p3/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n" "zip1 z13.d, z20.d, z19.d\n" "zip2 z19.d, z20.d, z19.d\n" - "ld1w { z4.s }, p1/Z, [x22, #5, MUL VL]\n" - "ld1w { z26.s }, p6/Z, [x21]\n" + "ld1w { z0.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z26.s }, p6/Z, [x20]\n" "zip1 z20.d, z21.d, z26.d\n" "zip2 z26.d, z21.d, z26.d\n" - "ld1w { z27.s }, p5/Z, [x21, #1, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21, #2, MUL VL]\n" + "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n" "zip1 z21.d, z22.d, z27.d\n" "zip2 z27.d, z22.d, z27.d\n" - "ld1w { z29.s }, p3/Z, [x21, #3, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #4, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n" "zip1 z22.d, z23.d, z28.d\n" "zip2 z28.d, z23.d, z28.d\n" - "ld1w { z31.s }, p1/Z, [x21, #5, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n" "zip1 z23.d, z24.d, z29.d\n" "zip2 z29.d, z24.d, z29.d\n" "zip1 z24.d, z25.d, z30.d\n" "zip2 z30.d, z25.d, z30.d\n" - "zip1 z25.d, z4.d, z31.d\n" - "zip2 z31.d, z4.d, z31.d\n" + "zip1 z25.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 44f\n" "43:" // Height 4: no accumulate "mov z8.b, #0x0\n" @@ -1029,14 +1029,14 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20, LSL #2\n" @@ -1046,127 +1046,127 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "47:" // Height 4: input setup done "cmp x25, #0x4\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - "ld1rqw { z3.s }, p0/Z, [x21]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "ld1rqw { z7.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x23]\n" + ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "uzp1 z7.h, 
z7.h, z7.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n" + "uzp1 z4.h, z4.h, z4.h\n" + "trn1 z7.d, z7.d, z6.d\n" + ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n" "sub x25, x25, #0x4\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" + "trn1 z5.d, z5.d, z4.d\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n" "cmp x25, #0x4\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" + ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #-8, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" + ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n" + ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n" "add x21, x21, #0x10\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n" "addvl x28, x28, #-4\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n" + ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n" + ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n" + ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n" + ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n" 
+ ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x25\n" - "ld1rqw { z0.s }, p0/Z, [x24]\n" - "ld1rqw { z1.s }, p0/Z, [x23]\n" - ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x22]\n" - "ld1rqw { z3.s }, p0/Z, [x21]\n" - ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n" - ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n" - ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z4.h }, p7/Z, [x28]\n" - "ld1h { z5.h }, p7/Z, [x28, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z6.h }, p7/Z, [x28, #2, MUL VL]\n" - "ld1h { z7.h }, p7/Z, [x28, #3, MUL VL]\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #4, MUL VL]\n" - ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #5, MUL VL]\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #6, MUL VL]\n" - ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #7, MUL VL]\n" + "ld1rqw { z7.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x23]\n" + ".inst 0x658abce7 // bfcvt z7.h, p7/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x22]\n" + "ld1rqw { z4.s }, p0/Z, [x21]\n" + ".inst 0x658abcc6 // bfcvt z6.h, p7/M, z6.s\n" + ".inst 0x658abca5 // bfcvt z5.h, p7/M, z5.s\n" + ".inst 0x658abc84 // bfcvt z4.h, p7/M, z4.s\n" + "uzp1 z7.h, z7.h, z7.h\n" + "ld1h { z3.h }, p7/Z, [x28]\n" + "ld1h { z2.h }, p7/Z, [x28, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "uzp1 z5.h, z5.h, z5.h\n" + "ld1h { z1.h }, p7/Z, [x28, #2, MUL VL]\n" + "ld1h { z0.h }, p7/Z, [x28, #3, MUL VL]\n" + "uzp1 z4.h, z4.h, z4.h\n" + "trn1 z7.d, z7.d, z6.d\n" + ".inst 0x6463e4e8 // bfmmla z8.s, z7.h, z3.h\n" + "trn1 z5.d, z5.d, z4.d\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6462e4ee // bfmmla z14.s, z7.h, z2.h\n" + "ld1h { z3.h }, p7/Z, [x28, #4, MUL VL]\n" + ".inst 0x6462e4ba // bfmmla z26.s, z5.h, z2.h\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #5, MUL VL]\n" + ".inst 0x6461e4b5 // bfmmla z21.s, z5.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #6, MUL VL]\n" + ".inst 0x6460e4bb // bfmmla z27.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n" - "ld1h { z4.h }, p7/Z, [x28, #-8, MUL VL]\n" - ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - "ld1h { z5.h }, p7/Z, [x28, #-7, MUL VL]\n" - ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n" - ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n" - "ld1h { z6.h }, p7/Z, [x28, #-6, MUL VL]\n" - ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n" - "ld1h { z7.h }, p7/Z, [x28, #-5, MUL VL]\n" - ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6463e4ea // bfmmla z10.s, z7.h, z3.h\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6462e4f0 // bfmmla z16.s, z7.h, z2.h\n" + "ld1h { z3.h }, 
p7/Z, [x28, #-8, MUL VL]\n" + ".inst 0x6462e4bc // bfmmla z28.s, z5.h, z2.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + "ld1h { z2.h }, p7/Z, [x28, #-7, MUL VL]\n" + ".inst 0x6461e4b7 // bfmmla z23.s, z5.h, z1.h\n" + ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" + "ld1h { z1.h }, p7/Z, [x28, #-6, MUL VL]\n" + ".inst 0x6460e4bd // bfmmla z29.s, z5.h, z0.h\n" + "ld1h { z0.h }, p7/Z, [x28, #-5, MUL VL]\n" + ".inst 0x6463e4ec // bfmmla z12.s, z7.h, z3.h\n" "addvl x28, x28, #-4\n" - ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n" - ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n" - ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n" - ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n" - ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n" - ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n" - ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n" + ".inst 0x6463e4b8 // bfmmla z24.s, z5.h, z3.h\n" + ".inst 0x6462e4f2 // bfmmla z18.s, z7.h, z2.h\n" + ".inst 0x6462e4be // bfmmla z30.s, z5.h, z2.h\n" + ".inst 0x6461e4ed // bfmmla z13.s, z7.h, z1.h\n" + ".inst 0x6461e4b9 // bfmmla z25.s, z5.h, z1.h\n" + ".inst 0x6460e4f3 // bfmmla z19.s, z7.h, z0.h\n" + ".inst 0x6460e4bf // bfmmla z31.s, z5.h, z0.h\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x26, x26, #0x1\n" @@ -1295,7 +1295,6 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "54:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1303,4 +1302,4 @@ void sve_hybrid_fp32bf16fp32_mmla_4x6VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp index b7c9aca9dd..15b7dd721c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
+ * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -75,7 +75,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, float>::value) { switch (ci->get_cpu_model()) { default: @@ -100,5 +99,4 @@ } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp index 23d7ff9c3b..0d2b47ec39 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp @@ -133,16 +133,16 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 5f\n" "3:" // Height 1: no bias "tbz %x[flags], #0, 4f\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z16.d, z12.d\n" + "zip2 z12.d, z16.d, z12.d\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 5f\n" @@ -160,11 +160,11 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "6:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 7f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 8f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -176,51 +176,51 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "ble 10f\n" "9:" // Height 1: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla
z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqw { z18.s }, p0/Z, [x26]\n" + ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n" + "uzp1 z18.h, z18.h, z18.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "add x26, x26, #0x10\n" "addvl x10, x10, #8\n" "bgt 9b\n" "10:" // Height 1: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + "ld1rqw { z18.s }, p0/Z, [x26]\n" + ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n" + "uzp1 z18.h, z18.h, z18.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x6471e648 // bfmmla z8.s, z18.h, z17.h\n" + ".inst 0x6470e64c // bfmmla z12.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e649 // bfmmla z9.s, z18.h, z17.h\n" + ".inst 0x6470e64d // bfmmla z13.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e64a // bfmmla z10.s, z18.h, z17.h\n" + ".inst 0x6470e64e // bfmmla z14.s, z18.h, z16.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6471e64b // bfmmla z11.s, z18.h, z17.h\n" + ".inst 0x6470e64f // bfmmla z15.s, z18.h, z16.h\n" "addvl x10, x10, #8\n" "11:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" @@ -233,17 +233,17 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "uzp1 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 12f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "12:" // Height 1: No activation "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" @@ -287,21 +287,21 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "16:" // Height 2: no bias "tbz %x[flags], #0, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x9, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 18f\n" @@ -319,12 +319,12 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "19:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 20f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 21f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -332,67 +332,67 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 21f\n" "20:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" "21:" // Height 2: input setup done "cmp x27, #0x4\n" "ble 23f\n" "22:" // Height 2: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - "uzp1 
z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqw { z19.s }, p0/Z, [x26]\n" + "ld1rqw { z18.s }, p0/Z, [x25]\n" + ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n" + ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n" + "uzp1 z19.h, z19.h, z19.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z18.h, z18.h, z18.h\n" + "trn1 z19.d, z19.d, z18.d\n" + ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n" + ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "addvl x10, x10, #8\n" "bgt 22b\n" "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1rqw { z19.s }, p0/Z, [x26]\n" + "ld1rqw { z18.s }, p0/Z, [x25]\n" + ".inst 0x658ab673 // bfcvt z19.h, p5/M, z19.s\n" + ".inst 0x658ab652 // bfcvt z18.h, p5/M, z18.s\n" + "uzp1 z19.h, z19.h, z19.h\n" + "ld1h { z17.h }, p5/Z, [x10]\n" + "ld1h { z16.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z18.h, z18.h, z18.h\n" + "trn1 z19.d, z19.d, z18.d\n" + ".inst 0x6471e668 // bfmmla z8.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6470e66c // bfmmla z12.s, z19.h, z16.h\n" + "ld1h { z16.h }, 
p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6471e669 // bfmmla z9.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6470e66d // bfmmla z13.s, z19.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6471e66a // bfmmla z10.s, z19.h, z17.h\n" + "ld1h { z17.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6470e66e // bfmmla z14.s, z19.h, z16.h\n" + "ld1h { z16.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6471e66b // bfmmla z11.s, z19.h, z17.h\n" "addvl x10, x10, #8\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" + ".inst 0x6470e66f // bfmmla z15.s, z19.h, z16.h\n" "24:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -410,25 +410,25 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "uzp2 z11.d, z11.d, z15.d\n" "tbz %x[flags], #1, 25f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z17.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z6.s, p5/M, z6.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmax z6.s, p5/M, z6.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" + "ld1rw { z16.s }, p5/Z, [x20]\n" + "fmin z6.s, p5/M, z6.s, z17.s\n" + "fmin z12.s, p5/M, z12.s, z17.s\n" + "fmin z13.s, p5/M, z13.s, z17.s\n" + "fmin z14.s, p5/M, z14.s, z17.s\n" + "fmin z8.s, p5/M, z8.s, z17.s\n" + "fmin z9.s, p5/M, z9.s, z17.s\n" + "fmin z10.s, p5/M, z10.s, z17.s\n" + "fmin z11.s, p5/M, z11.s, z17.s\n" + "fmax z6.s, p5/M, z6.s, z16.s\n" + "fmax z12.s, p5/M, z12.s, z16.s\n" + "fmax z13.s, p5/M, z13.s, z16.s\n" + "fmax z14.s, p5/M, z14.s, z16.s\n" + "fmax z8.s, p5/M, z8.s, z16.s\n" + "fmax z9.s, p5/M, z9.s, z16.s\n" + "fmax z10.s, p5/M, z10.s, z16.s\n" + "fmax z11.s, p5/M, z11.s, z16.s\n" "25:" // Height 2: No activation "st1w { z6.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -484,28 +484,28 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "29:" // Height 3: no bias "tbz %x[flags], #0, 30f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, 
MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x20]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" @@ -537,13 +537,13 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -552,91 +552,91 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" "34:" // Height 3: input setup done "cmp x27, #0x4\n" "ble 36f\n" "35:" // Height 3: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "ld1rqw { z28.s }, p0/Z, [x26]\n" + "ld1rqw { z27.s }, p0/Z, [x25]\n" + ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n" + "ld1rqw { z26.s }, p0/Z, [x24]\n" + ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n" + "uzp1 z28.h, z28.h, z28.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "uzp1 z27.h, z27.h, z27.h\n" + ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x4\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "trn1 z28.d, z28.d, z27.d\n" + "uzp1 z26.h, z26.h, z26.h\n" + ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n" "cmp x27, #0x4\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n" "add x25, x25, #0x10\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 
0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "bgt 35b\n" "36:" // Height 3: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "uzp1 z1.h, z1.h, z1.h\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "trn1 z0.d, z0.d, z1.d\n" - "uzp1 z2.h, z2.h, z2.h\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1rqw { z28.s }, p0/Z, [x26]\n" + "ld1rqw { z27.s }, p0/Z, [x25]\n" + ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n" + "ld1rqw { z26.s }, p0/Z, [x24]\n" + ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n" + "uzp1 z28.h, z28.h, z28.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "uzp1 z27.h, z27.h, z27.h\n" + ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "trn1 z28.d, z28.d, z27.d\n" + "uzp1 z26.h, z26.h, z26.h\n" + ".inst 0x6479e788 // bfmmla z8.s, z28.h, z25.h\n" + ".inst 0x6479e750 // bfmmla z16.s, z26.h, z25.h\n" + ".inst 0x6478e78c // bfmmla z12.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 
0x6478e754 // bfmmla z20.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e789 // bfmmla z9.s, z28.h, z25.h\n" + ".inst 0x6479e751 // bfmmla z17.s, z26.h, z25.h\n" + ".inst 0x6478e78d // bfmmla z13.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6478e755 // bfmmla z21.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e78a // bfmmla z10.s, z28.h, z25.h\n" + ".inst 0x6479e752 // bfmmla z18.s, z26.h, z25.h\n" + ".inst 0x6478e78e // bfmmla z14.s, z28.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e756 // bfmmla z22.s, z26.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6479e78b // bfmmla z11.s, z28.h, z25.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6479e753 // bfmmla z19.s, z26.h, z25.h\n" + ".inst 0x6478e78f // bfmmla z15.s, z28.h, z24.h\n" + ".inst 0x6478e757 // bfmmla z23.s, z26.h, z24.h\n" "37:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -659,33 +659,33 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "uzp1 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 38f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z25.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z6.s, p5/M, z6.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z6.s, p5/M, z6.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" + "fmin z6.s, p5/M, z6.s, z25.s\n" + "fmin z12.s, p5/M, z12.s, z25.s\n" + "fmin z13.s, p5/M, z13.s, z25.s\n" + "fmin z14.s, p5/M, z14.s, z25.s\n" + "fmin z8.s, p5/M, z8.s, z25.s\n" + "fmin z9.s, p5/M, z9.s, z25.s\n" + "fmin z10.s, p5/M, z10.s, z25.s\n" + "fmin z11.s, p5/M, z11.s, z25.s\n" + "fmin z16.s, p5/M, z16.s, z25.s\n" + "fmin z17.s, p5/M, z17.s, z25.s\n" + "fmin z18.s, p5/M, z18.s, z25.s\n" + "fmin z19.s, p5/M, z19.s, z25.s\n" + "fmax z6.s, p5/M, z6.s, z24.s\n" + "fmax z12.s, p5/M, z12.s, z24.s\n" + "fmax z13.s, p5/M, z13.s, z24.s\n" + "fmax z14.s, p5/M, z14.s, z24.s\n" + "fmax z8.s, p5/M, z8.s, z24.s\n" + "fmax z9.s, p5/M, z9.s, z24.s\n" + "fmax z10.s, p5/M, z10.s, z24.s\n" + "fmax z11.s, p5/M, z11.s, z24.s\n" + "fmax z16.s, p5/M, z16.s, z24.s\n" + "fmax z17.s, p5/M, z17.s, z24.s\n" + "fmax z18.s, p5/M, z18.s, z24.s\n" + "fmax z19.s, p5/M, z19.s, z24.s\n" "38:" // Height 3: No activation "st1w { z6.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -745,37 +745,37 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "42:" // Height 4: no bias "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - 
"add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x21]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" @@ -803,14 +803,14 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "45:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -820,101 +820,101 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 47f\n" "46:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" "47:" // Height 4: input setup done "cmp x27, #0x4\n" "ble 49f\n" "48:" // Height 4: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s 
}, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" + "ld1rqw { z29.s }, p0/Z, [x26]\n" + "ld1rqw { z28.s }, p0/Z, [x25]\n" + ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n" + "ld1rqw { z27.s }, p0/Z, [x24]\n" + "ld1rqw { z26.s }, p0/Z, [x23]\n" + ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n" + ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n" + ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n" + "uzp1 z29.h, z29.h, z29.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z28.h, z28.h, z28.h\n" + "uzp1 z27.h, z27.h, z27.h\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "uzp1 z26.h, z26.h, z26.h\n" + "trn1 z29.d, z29.d, z28.d\n" + ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n" "add x26, x26, #0x10\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + "trn1 z27.d, z27.d, z26.d\n" + ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n" + ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n" "add x25, x25, #0x10\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n" + ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n" "add x23, x23, #0x10\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n" + ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n" + ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n" + ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n" "bgt 48b\n" "49:" // Height 4: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, 
[x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "uzp1 z3.h, z3.h, z3.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1rqw { z29.s }, p0/Z, [x26]\n" + "ld1rqw { z28.s }, p0/Z, [x25]\n" + ".inst 0x658ab7bd // bfcvt z29.h, p5/M, z29.s\n" + "ld1rqw { z27.s }, p0/Z, [x24]\n" + "ld1rqw { z26.s }, p0/Z, [x23]\n" + ".inst 0x658ab79c // bfcvt z28.h, p5/M, z28.s\n" + ".inst 0x658ab77b // bfcvt z27.h, p5/M, z27.s\n" + ".inst 0x658ab75a // bfcvt z26.h, p5/M, z26.s\n" + "uzp1 z29.h, z29.h, z29.h\n" + "ld1h { z25.h }, p5/Z, [x10]\n" + "ld1h { z24.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z28.h, z28.h, z28.h\n" + "uzp1 z27.h, z27.h, z27.h\n" + "uzp1 z26.h, z26.h, z26.h\n" + "trn1 z29.d, z29.d, z28.d\n" + ".inst 0x6479e7a8 // bfmmla z8.s, z29.h, z25.h\n" + "trn1 z27.d, z27.d, z26.d\n" + ".inst 0x6479e770 // bfmmla z16.s, z27.h, z25.h\n" + ".inst 0x6478e7ac // bfmmla z12.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6478e774 // bfmmla z20.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6479e7a9 // bfmmla z9.s, z29.h, z25.h\n" + ".inst 0x6479e771 // bfmmla z17.s, z27.h, z25.h\n" + ".inst 0x6478e7ad // bfmmla z13.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6478e775 // bfmmla z21.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6479e7aa // bfmmla z10.s, z29.h, z25.h\n" + ".inst 0x6479e772 // bfmmla z18.s, z27.h, z25.h\n" + ".inst 0x6478e7ae // bfmmla z14.s, z29.h, z24.h\n" + "ld1h { z25.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6478e776 // bfmmla z22.s, z27.h, z24.h\n" + "ld1h { z24.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6479e7ab // bfmmla z11.s, z29.h, z25.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" + ".inst 0x6479e773 // bfmmla z19.s, z27.h, z25.h\n" + ".inst 0x6478e7af // bfmmla z15.s, z29.h, z24.h\n" + ".inst 0x6478e777 // bfmmla z23.s, z27.h, z24.h\n" "50:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -942,41 +942,41 @@ void 
sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "uzp2 z19.d, z19.d, z23.d\n" "tbz %x[flags], #1, 51f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" + "ld1rw { z24.s }, p5/Z, [x20]\n" "add x20, %x[args_ptr], %[offset_min]\n" - "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z6.s, p5/M, z6.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmax z6.s, p5/M, z6.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z6.s, p5/M, z6.s, z24.s\n" + "fmin z12.s, p5/M, z12.s, z24.s\n" + "fmin z13.s, p5/M, z13.s, z24.s\n" + "fmin z14.s, p5/M, z14.s, z24.s\n" + "fmin z8.s, p5/M, z8.s, z24.s\n" + "fmin z9.s, p5/M, z9.s, z24.s\n" + "fmin z10.s, p5/M, z10.s, z24.s\n" + "fmin z11.s, p5/M, z11.s, z24.s\n" + "fmin z15.s, p5/M, z15.s, z24.s\n" + "fmin z20.s, p5/M, z20.s, z24.s\n" + "fmin z21.s, p5/M, z21.s, z24.s\n" + "fmin z22.s, p5/M, z22.s, z24.s\n" + "fmin z16.s, p5/M, z16.s, z24.s\n" + "fmin z17.s, p5/M, z17.s, z24.s\n" + "fmin z18.s, p5/M, z18.s, z24.s\n" + "fmin z19.s, p5/M, z19.s, z24.s\n" + "fmax z6.s, p5/M, z6.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" "51:" // Height 4: No activation "st1w { z6.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1048,54 +1048,54 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "55:" // Height 5: no bias "tbz %x[flags], #0, 56f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x23, x24, x20, LSL #2\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - 
"ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x20]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z19.d, z24.d, z23.d\n" "zip2 z23.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z24.d, z25.d, z28.d\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 57f\n" "56:" // Height 5: no accumulate "mov z8.b, #0x0\n" @@ -1127,15 +1127,15 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "58:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 59f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 60f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1146,125 +1146,125 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 60f\n" "59:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" "60:" // Height 5: input setup done "cmp x27, 
#0x4\n" "ble 62f\n" "61:" // Height 5: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x26]\n" + "ld1rqw { z5.s }, p0/Z, [x25]\n" + ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n" + "ld1rqw { z4.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" + ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + "ld1rqw { z2.s }, p0/Z, [x22]\n" ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "uzp1 z5.h, z5.h, z5.h\n" + "uzp1 z4.h, z4.h, z4.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" "sub x27, x27, #0x4\n" "uzp1 z3.h, z3.h, z3.h\n" - ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" "cmp x27, #0x4\n" "add x26, x26, #0x10\n" - "trn1 z0.d, z0.d, z1.d\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "trn1 z6.d, z6.d, z5.d\n" + "trn1 z4.d, z4.d, z3.d\n" + ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" "add x25, x25, #0x10\n" - "uzp1 z4.h, z4.h, z4.h\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" "add x22, x22, #0x10\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" 
+ ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" + ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" "bgt 61b\n" "62:" // Height 5: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" + "ld1rqw { z6.s }, p0/Z, [x26]\n" + "ld1rqw { z5.s }, p0/Z, [x25]\n" + ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n" + "ld1rqw { z4.s }, p0/Z, [x24]\n" "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z3.h, z3.h, z3.h\n" + ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" - "trn1 z0.d, z0.d, z1.d\n" - "trn1 z2.d, z2.d, z3.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "ld1rqw { z2.s }, p0/Z, [x22]\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + "uzp1 z6.h, z6.h, z6.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "uzp1 z5.h, z5.h, z5.h\n" "uzp1 z4.h, z4.h, z4.h\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z3.h, z3.h, z3.h\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "trn1 z6.d, z6.d, z5.d\n" + "trn1 z4.d, z4.d, z3.d\n" 
+ ".inst 0x6461e4c8 // bfmmla z8.s, z6.h, z1.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + ".inst 0x6461e490 // bfmmla z16.s, z4.h, z1.h\n" + ".inst 0x6461e458 // bfmmla z24.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e4cc // bfmmla z12.s, z6.h, z0.h\n" + ".inst 0x6460e494 // bfmmla z20.s, z4.h, z0.h\n" + ".inst 0x6460e45c // bfmmla z28.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4c9 // bfmmla z9.s, z6.h, z1.h\n" + ".inst 0x6461e491 // bfmmla z17.s, z4.h, z1.h\n" + ".inst 0x6461e459 // bfmmla z25.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4cd // bfmmla z13.s, z6.h, z0.h\n" + ".inst 0x6460e495 // bfmmla z21.s, z4.h, z0.h\n" + ".inst 0x6460e45d // bfmmla z29.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461e4ca // bfmmla z10.s, z6.h, z1.h\n" + ".inst 0x6461e492 // bfmmla z18.s, z4.h, z1.h\n" + ".inst 0x6461e45a // bfmmla z26.s, z2.h, z1.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4ce // bfmmla z14.s, z6.h, z0.h\n" + ".inst 0x6460e496 // bfmmla z22.s, z4.h, z0.h\n" + ".inst 0x6460e45e // bfmmla z30.s, z2.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6461e4cb // bfmmla z11.s, z6.h, z1.h\n" "addvl x10, x10, #8\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + ".inst 0x6461e493 // bfmmla z19.s, z4.h, z1.h\n" + ".inst 0x6461e45b // bfmmla z27.s, z2.h, z1.h\n" + ".inst 0x6460e4cf // bfmmla z15.s, z6.h, z0.h\n" + ".inst 0x6460e497 // bfmmla z23.s, z4.h, z0.h\n" + ".inst 0x6460e45f // bfmmla z31.s, z2.h, z0.h\n" "63:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1297,49 +1297,49 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "uzp1 z27.d, z27.d, z31.d\n" "tbz %x[flags], #1, 64f\n" "add x20, %x[args_ptr], %[offset_max]\n" - "ld1rw { z1.s }, p5/Z, [x20]\n" - "add x20, %x[args_ptr], %[offset_min]\n" "ld1rw { z0.s }, p5/Z, [x20]\n" - "fmin z6.s, p5/M, z6.s, z1.s\n" - "fmin z12.s, p5/M, z12.s, z1.s\n" - "fmin z13.s, p5/M, z13.s, z1.s\n" - "fmin z14.s, p5/M, z14.s, z1.s\n" - "fmin z8.s, p5/M, z8.s, z1.s\n" - "fmin z9.s, p5/M, z9.s, z1.s\n" - "fmin z10.s, p5/M, z10.s, z1.s\n" - "fmin z11.s, p5/M, z11.s, z1.s\n" - "fmin z15.s, p5/M, z15.s, z1.s\n" - "fmin z20.s, p5/M, z20.s, z1.s\n" - "fmin z21.s, p5/M, z21.s, z1.s\n" - "fmin z22.s, p5/M, z22.s, z1.s\n" - "fmin z16.s, p5/M, z16.s, z1.s\n" - "fmin z17.s, p5/M, z17.s, z1.s\n" - "fmin z18.s, p5/M, z18.s, z1.s\n" - "fmin z19.s, p5/M, z19.s, z1.s\n" - "fmin z24.s, p5/M, z24.s, z1.s\n" - "fmin z25.s, p5/M, z25.s, z1.s\n" - "fmin z26.s, p5/M, z26.s, z1.s\n" - "fmin z27.s, p5/M, z27.s, z1.s\n" - "fmax z6.s, p5/M, z6.s, z0.s\n" - "fmax z12.s, p5/M, z12.s, z0.s\n" - "fmax z13.s, p5/M, z13.s, z0.s\n" - "fmax z14.s, p5/M, z14.s, z0.s\n" - "fmax z8.s, p5/M, z8.s, z0.s\n" - "fmax z9.s, p5/M, z9.s, z0.s\n" - "fmax z10.s, p5/M, z10.s, z0.s\n" - "fmax z11.s, p5/M, z11.s, z0.s\n" - "fmax z15.s, p5/M, z15.s, z0.s\n" - "fmax z20.s, p5/M, z20.s, z0.s\n" - "fmax z21.s, p5/M, z21.s, z0.s\n" - "fmax z22.s, p5/M, z22.s, z0.s\n" - "fmax z16.s, p5/M, z16.s, z0.s\n" - "fmax z17.s, p5/M, z17.s, z0.s\n" - "fmax z18.s, p5/M, z18.s, z0.s\n" - "fmax z19.s, p5/M, z19.s, z0.s\n" - "fmax z24.s, p5/M, z24.s, z0.s\n" - "fmax z25.s, p5/M, z25.s, z0.s\n" - 
"fmax z26.s, p5/M, z26.s, z0.s\n" - "fmax z27.s, p5/M, z27.s, z0.s\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1rw { z23.s }, p5/Z, [x20]\n" + "fmin z6.s, p5/M, z6.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z6.s, p5/M, z6.s, z23.s\n" + "fmax z12.s, p5/M, z12.s, z23.s\n" + "fmax z13.s, p5/M, z13.s, z23.s\n" + "fmax z14.s, p5/M, z14.s, z23.s\n" + "fmax z8.s, p5/M, z8.s, z23.s\n" + "fmax z9.s, p5/M, z9.s, z23.s\n" + "fmax z10.s, p5/M, z10.s, z23.s\n" + "fmax z11.s, p5/M, z11.s, z23.s\n" + "fmax z15.s, p5/M, z15.s, z23.s\n" + "fmax z20.s, p5/M, z20.s, z23.s\n" + "fmax z21.s, p5/M, z21.s, z23.s\n" + "fmax z22.s, p5/M, z22.s, z23.s\n" + "fmax z16.s, p5/M, z16.s, z23.s\n" + "fmax z17.s, p5/M, z17.s, z23.s\n" + "fmax z18.s, p5/M, z18.s, z23.s\n" + "fmax z19.s, p5/M, z19.s, z23.s\n" + "fmax z24.s, p5/M, z24.s, z23.s\n" + "fmax z25.s, p5/M, z25.s, z23.s\n" + "fmax z26.s, p5/M, z26.s, z23.s\n" + "fmax z27.s, p5/M, z27.s, z23.s\n" "64:" // Height 5: No activation "st1w { z6.s }, p4, [x9]\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1418,59 +1418,59 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "68:" // Height 6: no bias "tbz %x[flags], #0, 69f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x25, x9, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" + "ld1w { z17.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" "add x21, x22, x20, LSL #2\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x25]\n" - "zip1 z8.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x25, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x25, #2, MUL VL]\n" - "zip2 z12.d, z9.d, z12.d\n" - "zip1 z9.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x25, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x24]\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x24]\n" + "zip1 z8.d, z17.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" + "zip2 z12.d, z17.d, z12.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x23]\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z20.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip2 z14.d, z20.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x23]\n" + "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x22]\n" "zip2 z15.d, z16.d, 
z15.d\n" "zip1 z16.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x22]\n" + "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x21]\n" "zip2 z21.d, z18.d, z21.d\n" "zip1 z18.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x20]\n" "zip2 z23.d, z24.d, z23.d\n" "zip1 z24.d, z25.d, z28.d\n" - "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" - "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 70f\n" "69:" // Height 6: no accumulate "mov z8.b, #0x0\n" @@ -1502,16 +1502,16 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "71:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 72f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 73f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20, LSL #2\n" @@ -1523,135 +1523,135 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "b 73f\n" "72:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "add x21, x22, x20, LSL #2\n" + "add x25, x26, x21, LSL #2\n" + "add x24, x25, x21, LSL #2\n" + "add x23, x24, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "add x21, x22, x21, LSL #2\n" "73:" // Height 6: input setup done "cmp x27, #0x4\n" "ble 75f\n" "74:" // Height 6: Multiply loop: Main loop head "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1rqw { z5.s }, p0/Z, [x21]\n" - ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + "ld1rqw { 
z7.s }, p0/Z, [x26]\n" + "ld1rqw { z6.s }, p0/Z, [x25]\n" + ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n" ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "uzp1 z7.h, z7.h, z7.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" + "uzp1 z5.h, z5.h, z5.h\n" "sub x27, x27, #0x4\n" "cmp x27, #0x4\n" - "uzp1 z3.h, z3.h, z3.h\n" "uzp1 z4.h, z4.h, z4.h\n" + "uzp1 z3.h, z3.h, z3.h\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "uzp1 z5.h, z5.h, z5.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "trn1 z7.d, z7.d, z6.d\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" "add x24, x24, #0x10\n" - "trn1 z2.d, z2.d, z3.d\n" - "trn1 z4.d, z4.d, z5.d\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" + "trn1 z5.d, z5.d, z4.d\n" + "trn1 z3.d, z3.d, z2.d\n" + ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n" "add x23, x23, #0x10\n" - ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n" + ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" "add x21, x21, #0x10\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n" + ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n" + ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" + ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n" + ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4b6 // 
bfmmla z22.s, z5.h, z0.h\n" + ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n" + ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n" "bgt 74b\n" "75:" // Height 6: Multiply loop: Single iteration only "whilelt p0.s, XZR, x27\n" - "ld1rqw { z0.s }, p0/Z, [x26]\n" - "ld1rqw { z1.s }, p0/Z, [x25]\n" - ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n" - "ld1rqw { z2.s }, p0/Z, [x24]\n" - "ld1rqw { z3.s }, p0/Z, [x23]\n" - ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n" - ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" - "ld1rqw { z4.s }, p0/Z, [x22]\n" - "ld1rqw { z5.s }, p0/Z, [x21]\n" - ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" - ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + "ld1rqw { z7.s }, p0/Z, [x26]\n" + "ld1rqw { z6.s }, p0/Z, [x25]\n" + ".inst 0x658ab4e7 // bfcvt z7.h, p5/M, z7.s\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "ld1rqw { z4.s }, p0/Z, [x23]\n" + ".inst 0x658ab4c6 // bfcvt z6.h, p5/M, z6.s\n" ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n" - "uzp1 z0.h, z0.h, z0.h\n" - "ld1h { z6.h }, p5/Z, [x10]\n" - "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n" - "uzp1 z1.h, z1.h, z1.h\n" - "uzp1 z2.h, z2.h, z2.h\n" - "uzp1 z3.h, z3.h, z3.h\n" - "uzp1 z4.h, z4.h, z4.h\n" + "ld1rqw { z3.s }, p0/Z, [x22]\n" + "ld1rqw { z2.s }, p0/Z, [x21]\n" + ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n" + ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n" + ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n" + "uzp1 z7.h, z7.h, z7.h\n" + "ld1h { z1.h }, p5/Z, [x10]\n" + "ld1h { z0.h }, p5/Z, [x10, #1, MUL VL]\n" + "uzp1 z6.h, z6.h, z6.h\n" "uzp1 z5.h, z5.h, z5.h\n" - "trn1 z0.d, z0.d, z1.d\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "trn1 z2.d, z2.d, z3.d\n" - "trn1 z4.d, z4.d, z5.d\n" - ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n" - ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n" - ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n" - ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" - ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n" - ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n" - ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n" - "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n" - ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n" - "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n" + "uzp1 z4.h, z4.h, z4.h\n" + "uzp1 z3.h, z3.h, z3.h\n" + "uzp1 z2.h, z2.h, z2.h\n" + "trn1 
z7.d, z7.d, z6.d\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" + "trn1 z5.d, z5.d, z4.d\n" + "trn1 z3.d, z3.d, z2.d\n" + ".inst 0x6461e4b0 // bfmmla z16.s, z5.h, z1.h\n" + ".inst 0x6461e478 // bfmmla z24.s, z3.h, z1.h\n" + ".inst 0x6460e4ec // bfmmla z12.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x6460e4b4 // bfmmla z20.s, z5.h, z0.h\n" + ".inst 0x6460e47c // bfmmla z28.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x6461e4e9 // bfmmla z9.s, z7.h, z1.h\n" + ".inst 0x6461e4b1 // bfmmla z17.s, z5.h, z1.h\n" + ".inst 0x6461e479 // bfmmla z25.s, z3.h, z1.h\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x6460e4b5 // bfmmla z21.s, z5.h, z0.h\n" + ".inst 0x6460e47d // bfmmla z29.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" + ".inst 0x6461e4b2 // bfmmla z18.s, z5.h, z1.h\n" + ".inst 0x6461e47a // bfmmla z26.s, z3.h, z1.h\n" + ".inst 0x6460e4ee // bfmmla z14.s, z7.h, z0.h\n" + "ld1h { z1.h }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x6460e4b6 // bfmmla z22.s, z5.h, z0.h\n" + ".inst 0x6460e47e // bfmmla z30.s, z3.h, z0.h\n" + "ld1h { z0.h }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n" - ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n" - ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n" - ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n" + ".inst 0x6461e4eb // bfmmla z11.s, z7.h, z1.h\n" + ".inst 0x6461e4b3 // bfmmla z19.s, z5.h, z1.h\n" + ".inst 0x6461e47b // bfmmla z27.s, z3.h, z1.h\n" + ".inst 0x6460e4ef // bfmmla z15.s, z7.h, z0.h\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6460e47f // bfmmla z31.s, z3.h, z0.h\n" "76:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1782,7 +1782,6 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1790,4 +1789,4 @@ void sve_hybrid_fp32bf16fp32_mmla_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp index c8a7d66f28..ffc1606b3f 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, int8_t>::value) { switch (ci->get_cpu_model()) { default: @@ -97,5 +96,4 @@ } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp index 562b2759aa..b7c523466e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp @@ -104,11 +104,11 @@ void sve_hybrid_s8qa_dot_4x4VL ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -121,39 +121,39 @@ "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z20.b }, p2/Z, 
[x28]\n" + "sdot z16.s, z20.b, z0.b[0]\n" + "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z17.s, z21.b, z0.b[0]\n" + "sdot z18.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z19.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n" + "sdot z16.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n" + "sdot z17.s, z21.b, z0.b[1]\n" + "sdot z18.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z19.s, z20.b, z0.b[1]\n" + "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x28, #-6, MUL VL]\n" + "sdot z16.s, z22.b, z0.b[2]\n" + "sdot z17.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n" + "sdot z18.s, z21.b, z0.b[2]\n" + "sdot z19.s, z20.b, z0.b[2]\n" + "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n" + "sdot z16.s, z22.b, z0.b[3]\n" + "sdot z17.s, z20.b, z0.b[3]\n" + "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n" + "sdot z18.s, z21.b, z0.b[3]\n" + "sdot z19.s, z20.b, z0.b[3]\n" "add x24, x24, #0x10\n" "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" @@ -164,47 +164,47 @@ void sve_hybrid_s8qa_dot_4x4VL ( "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z19.s, z7.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z16.s, z22.b, z0.b[0]\n" + "sdot z17.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z18.s, z21.b, z0.b[0]\n" + "sdot z19.s, z20.b, z0.b[0]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z16.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z22.b, z0.b[1]\n" + "sdot z18.s, z21.b, z0.b[1]\n" + "sdot z19.s, z20.b, z0.b[1]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, 
MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z16.s, z20.b, z0.b[2]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z22.b, z0.b[2]\n" + "sdot z18.s, z21.b, z0.b[2]\n" + "sdot z19.s, z20.b, z0.b[2]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" + "ld1b { z21.b }, p2/Z, [x28]\n" + "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z21.b, z0.b[3]\n" + "sdot z17.s, z20.b, z0.b[3]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z18.s, z21.b, z0.b[3]\n" + "sdot z19.s, z20.b, z0.b[3]\n" "addvl x28, x28, #4\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" @@ -218,71 +218,71 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" - "neg z1.s, p2/M, z1.s\n" - "mul z11.s, p2/M, z11.s, z1.s\n" + "neg z20.s, p2/M, z20.s\n" + "mul z11.s, p2/M, z11.s, z20.s\n" "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [x10]\n" + "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "add z16.s, z16.s, z23.s\n" + "add z17.s, z17.s, z22.s\n" + "add z18.s, z18.s, z21.s\n" + "add z19.s, z19.s, z20.s\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n" + ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n" "addvl x10, x10, #4\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" + ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n" "tbz %x[flags], #5, 13f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z23.d, z16.d, z0.d\n" + "and z22.d, z17.d, z0.d\n" + "and z21.d, z18.d, z0.d\n" + "and 
z20.d, z19.d, z0.d\n" + "asr z23.s, z23.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z16.s, z16.s, z23.s\n" + "sqadd z17.s, z17.s, z22.s\n" + "sqadd z18.s, z18.s, z21.s\n" + "sqadd z19.s, z19.s, z20.s\n" "13:" // Height 1: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z20.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z20.s\n" + "add z18.s, z18.s, z20.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z4.s\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z20.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z21.s\n" + "smin z17.s, p2/M, z17.s, z21.s\n" + "smin z18.s, p2/M, z18.s, z21.s\n" + "smin z19.s, p2/M, z19.s, z21.s\n" + "smax z16.s, p2/M, z16.s, z20.s\n" + "smax z17.s, p2/M, z17.s, z20.s\n" + "smax z18.s, p2/M, z18.s, z20.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z20.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" "st1b { z16.b }, p1, [x27]\n" @@ -317,12 +317,12 @@ void sve_hybrid_s8qa_dot_4x4VL ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -330,7 +330,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "b 20f\n" "19:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "20:" // Height 2: input setup done "cmp x25, #0x10\n" "ble 23f\n" @@ -339,56 +339,56 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "sdot z21.s, z5.b, z1.b[0]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "sdot z23.s, z7.b, z1.b[0]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z24.b, z0.b[0]\n" + 
"sdot z20.s, z24.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z26.b, z0.b[0]\n" + "sdot z21.s, z26.b, z1.b[0]\n" + "sdot z18.s, z24.b, z0.b[0]\n" + "sdot z22.s, z24.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n" + "sdot z19.s, z25.b, z0.b[0]\n" + "sdot z23.s, z25.b, z1.b[0]\n" + "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "sdot z20.s, z8.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "sdot z21.s, z9.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "sdot z23.s, z4.b, z1.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" + "sdot z16.s, z24.b, z0.b[1]\n" + "sdot z20.s, z24.b, z1.b[1]\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + "sdot z17.s, z27.b, z0.b[1]\n" + "sdot z21.s, z27.b, z1.b[1]\n" + "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" + "sdot z18.s, z26.b, z0.b[1]\n" + "sdot z22.s, z26.b, z1.b[1]\n" + "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "sdot z19.s, z25.b, z0.b[1]\n" + "sdot z23.s, z25.b, z1.b[1]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "sdot z16.s, z24.b, z0.b[2]\n" + "sdot z20.s, z24.b, z1.b[2]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" "add x23, x23, #0x10\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "sdot z21.s, z6.b, z1.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "sdot z23.s, z8.b, z1.b[2]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z17.s, z30.b, z0.b[2]\n" + "sdot z21.s, z30.b, z1.b[2]\n" + "sdot z18.s, z29.b, z0.b[2]\n" + "sdot z22.s, z29.b, z1.b[2]\n" + "sdot z19.s, z28.b, z0.b[2]\n" + "sdot z23.s, z28.b, z1.b[2]\n" + "sdot z16.s, z27.b, z0.b[3]\n" + "sdot z20.s, z27.b, z1.b[3]\n" + "sdot z17.s, z26.b, z0.b[3]\n" + "sdot z21.s, z26.b, z1.b[3]\n" + "sdot z18.s, z25.b, z0.b[3]\n" + "sdot z22.s, z25.b, z1.b[3]\n" + "sdot z19.s, z24.b, z0.b[3]\n" + "sdot z23.s, z24.b, z1.b[3]\n" "tbnz %x[flags], #31, 22f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" @@ -401,63 +401,63 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "subs x25, x25, #0x4\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "sdot z21.s, z5.b, z1.b[0]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z24.b, 
z0.b[0]\n" + "sdot z20.s, z24.b, z1.b[0]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z26.b, z0.b[0]\n" + "sdot z21.s, z26.b, z1.b[0]\n" + "sdot z18.s, z25.b, z0.b[0]\n" + "sdot z22.s, z25.b, z1.b[0]\n" "addvl x28, x28, #4\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z19.s, z24.b, z0.b[0]\n" + "sdot z23.s, z24.b, z1.b[0]\n" "ble 24f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[1]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "sdot z21.s, z9.b, z1.b[1]\n" - "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z16.s, z27.b, z0.b[1]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z27.b, z1.b[1]\n" + "sdot z17.s, z26.b, z0.b[1]\n" + "sdot z21.s, z26.b, z1.b[1]\n" + "sdot z18.s, z25.b, z0.b[1]\n" "addvl x28, x28, #4\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z22.s, z25.b, z1.b[1]\n" + "sdot z19.s, z24.b, z0.b[1]\n" + "sdot z23.s, z24.b, z1.b[1]\n" "ble 24f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "sdot z21.s, z6.b, z1.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z16.s, z27.b, z0.b[2]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z27.b, z1.b[2]\n" + "sdot z17.s, z26.b, z0.b[2]\n" + "sdot z21.s, z26.b, z1.b[2]\n" + "sdot z18.s, z25.b, z0.b[2]\n" "addvl x28, x28, #4\n" - "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z22.s, z25.b, z1.b[2]\n" + "sdot z19.s, z24.b, z0.b[2]\n" + "sdot z23.s, z24.b, z1.b[2]\n" "ble 24f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z24.b, z0.b[3]\n" + "sdot z20.s, z24.b, z1.b[3]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z17.s, z26.b, z0.b[3]\n" + "sdot z21.s, z26.b, z1.b[3]\n" + "sdot z18.s, z25.b, z0.b[3]\n" + "sdot z22.s, z25.b, z1.b[3]\n" "addvl x28, x28, #4\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z19.s, z24.b, z0.b[3]\n" + "sdot z23.s, z24.b, z1.b[3]\n" "24:" // Height 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 25f\n" "sdot z11.s, z0.b, z15.b\n" @@ -473,120 +473,120 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "saddv d12, p0, z12.s\n" - "neg 
z2.s, p2/M, z2.s\n" + "neg z24.s, p2/M, z24.s\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z2.s\n" - "mul z12.s, p2/M, z12.s, z2.s\n" + "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z12.s, p2/M, z12.s, z24.s\n" "26:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10]\n" + "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" + "add z16.s, z16.s, z28.s\n" + "add z17.s, z17.s, z27.s\n" "addvl x10, x10, #4\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" + "add z18.s, z18.s, z26.s\n" + "add z19.s, z19.s, z25.s\n" + "add z20.s, z20.s, z28.s\n" + "add z21.s, z21.s, z27.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "add z22.s, z22.s, z26.s\n" + "add z23.s, z23.s, z25.s\n" + ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" + ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" + ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" + ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" + ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" + ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" + ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" + ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" "tbz %x[flags], #5, 27f\n" - "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" + "and z24.d, z16.d, z0.d\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z16.s, z16.s, z24.s\n" + "and z30.d, z17.d, z0.d\n" + "and z29.d, z18.d, z0.d\n" + "and z28.d, z19.d, z0.d\n" + "and z27.d, z20.d, z0.d\n" + "and z26.d, z21.d, z0.d\n" + "and z25.d, z22.d, z0.d\n" + "and z24.d, z23.d, z0.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z27.s, z27.s, #0x1f\n" + 
"asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z17.s, z17.s, z30.s\n" + "sqadd z18.s, z18.s, z29.s\n" + "sqadd z19.s, z19.s, z28.s\n" + "sqadd z20.s, z20.s, z27.s\n" + "sqadd z21.s, z21.s, z26.s\n" + "sqadd z22.s, z22.s, z25.s\n" + "sqadd z23.s, z23.s, z24.s\n" "27:" // Height 2: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z24.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z24.s\n" + "add z18.s, z18.s, z24.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z24.s\n" + "add z20.s, z20.s, z24.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z24.s\n" + "add z22.s, z22.s, z24.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z23.s, z23.s, z4.s\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "add z23.s, z23.s, z24.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z25.s\n" + "smin z17.s, p2/M, z17.s, z25.s\n" + "smin z18.s, p2/M, z18.s, z25.s\n" + "smin z19.s, p2/M, z19.s, z25.s\n" + "smin z20.s, p2/M, z20.s, z25.s\n" + "smin z21.s, p2/M, z21.s, z25.s\n" + "smin z22.s, p2/M, z22.s, z25.s\n" + "smin z23.s, p2/M, z23.s, z25.s\n" + "smax z16.s, p2/M, z16.s, z24.s\n" + "smax z17.s, p2/M, z17.s, z24.s\n" + "smax z18.s, p2/M, z18.s, z24.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z24.s\n" + "smax z20.s, p2/M, z20.s, z24.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z24.s\n" + "smax z22.s, p2/M, z22.s, z24.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" + "smax z23.s, p2/M, z23.s, z24.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" "st1b { z20.b }, p1, [x23]\n" "addvl x27, x27, #1\n" "28:" // Height 2: Writeback done @@ -624,13 +624,13 @@ void sve_hybrid_s8qa_dot_4x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, 
#0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -639,8 +639,8 @@ void sve_hybrid_s8qa_dot_4x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "34:" // Height 3: input setup done "cmp x25, #0x10\n" "ble 37f\n" @@ -650,73 +650,73 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[0]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z21.s, z5.b, z1.b[0]\n" - "sdot z25.s, z5.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "sdot z26.s, z6.b, z2.b[0]\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28]\n" + "sdot z16.s, z28.b, z0.b[0]\n" + "sdot z20.s, z28.b, z1.b[0]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z28.b, z2.b[0]\n" + "sdot z17.s, z30.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z21.s, z30.b, z1.b[0]\n" + "sdot z25.s, z30.b, z2.b[0]\n" + "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n" + "sdot z18.s, z29.b, z0.b[0]\n" + "sdot z22.s, z29.b, z1.b[0]\n" + "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n" + "sdot z26.s, z29.b, z2.b[0]\n" + "sdot z19.s, z28.b, z0.b[0]\n" + "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "sdot z23.s, z7.b, z1.b[0]\n" - "sdot z27.s, z7.b, z2.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "sdot z20.s, z8.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + "sdot z23.s, z28.b, z1.b[0]\n" + "sdot z27.s, z28.b, z2.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n" + "sdot z16.s, z3.b, z0.b[1]\n" + "sdot z20.s, z3.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n" "add x23, x23, #0x10\n" - "sdot z24.s, z8.b, z2.b[1]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + "sdot z24.s, z3.b, z2.b[1]\n" + "sdot z17.s, z31.b, z0.b[1]\n" + "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n" "add x22, x22, #0x10\n" - "sdot z21.s, z9.b, z1.b[1]\n" - "sdot z25.s, z9.b, z2.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "sdot z26.s, z10.b, z2.b[1]\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z23.s, z4.b, z1.b[1]\n" - "sdot z27.s, z4.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "sdot z24.s, z5.b, z2.b[2]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z21.s, z6.b, z1.b[2]\n" - "sdot z25.s, z6.b, z2.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" 
- "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z26.s, z7.b, z2.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "sdot z23.s, z8.b, z1.b[2]\n" - "sdot z27.s, z8.b, z2.b[2]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "sdot z24.s, z9.b, z2.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" - "sdot z25.s, z10.b, z2.b[3]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" - "sdot z26.s, z4.b, z2.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" - "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z21.s, z31.b, z1.b[1]\n" + "sdot z25.s, z31.b, z2.b[1]\n" + "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n" + "sdot z18.s, z30.b, z0.b[1]\n" + "sdot z22.s, z30.b, z1.b[1]\n" + "sdot z26.s, z30.b, z2.b[1]\n" + "sdot z19.s, z29.b, z0.b[1]\n" + "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n" + "sdot z23.s, z29.b, z1.b[1]\n" + "sdot z27.s, z29.b, z2.b[1]\n" + "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n" + "sdot z16.s, z28.b, z0.b[2]\n" + "sdot z20.s, z28.b, z1.b[2]\n" + "sdot z24.s, z28.b, z2.b[2]\n" + "sdot z17.s, z5.b, z0.b[2]\n" + "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n" + "sdot z21.s, z5.b, z1.b[2]\n" + "sdot z25.s, z5.b, z2.b[2]\n" + "sdot z18.s, z4.b, z0.b[2]\n" + "sdot z22.s, z4.b, z1.b[2]\n" + "sdot z26.s, z4.b, z2.b[2]\n" + "sdot z19.s, z3.b, z0.b[2]\n" + "sdot z23.s, z3.b, z1.b[2]\n" + "sdot z27.s, z3.b, z2.b[2]\n" + "sdot z16.s, z31.b, z0.b[3]\n" + "sdot z20.s, z31.b, z1.b[3]\n" + "sdot z24.s, z31.b, z2.b[3]\n" + "sdot z17.s, z30.b, z0.b[3]\n" + "sdot z21.s, z30.b, z1.b[3]\n" + "sdot z25.s, z30.b, z2.b[3]\n" + "sdot z18.s, z29.b, z0.b[3]\n" + "sdot z22.s, z29.b, z1.b[3]\n" + "sdot z26.s, z29.b, z2.b[3]\n" + "sdot z19.s, z28.b, z0.b[3]\n" + "sdot z23.s, z28.b, z1.b[3]\n" + "sdot z27.s, z28.b, z2.b[3]\n" "tbnz %x[flags], #31, 36f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" @@ -731,79 +731,79 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z1.b }, p0/Z, [x23]\n" "subs x25, x25, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[0]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z21.s, z5.b, z1.b[0]\n" - "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28]\n" + "sdot z16.s, z28.b, z0.b[0]\n" + "sdot z20.s, z28.b, z1.b[0]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z24.s, z28.b, z2.b[0]\n" + "sdot z17.s, z30.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z21.s, z30.b, z1.b[0]\n" + "sdot z25.s, z30.b, z2.b[0]\n" "addvl x28, x28, #4\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" - "sdot z26.s, z6.b, z2.b[0]\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "sdot z23.s, z7.b, z1.b[0]\n" - "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z18.s, z29.b, z0.b[0]\n" + "sdot z22.s, z29.b, z1.b[0]\n" + "sdot z26.s, z29.b, z2.b[0]\n" + "sdot z19.s, z28.b, z0.b[0]\n" + "sdot z23.s, z28.b, z1.b[0]\n" + "sdot z27.s, z28.b, z2.b[0]\n" "ble 38f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[1]\n" - "sdot z24.s, z8.b, z2.b[1]\n" - "sdot z17.s, 
z9.b, z0.b[1]\n" - "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z16.s, z31.b, z0.b[1]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z31.b, z1.b[1]\n" + "sdot z24.s, z31.b, z2.b[1]\n" + "sdot z17.s, z30.b, z0.b[1]\n" + "sdot z21.s, z30.b, z1.b[1]\n" "addvl x28, x28, #4\n" - "sdot z25.s, z9.b, z2.b[1]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "sdot z26.s, z10.b, z2.b[1]\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "sdot z23.s, z4.b, z1.b[1]\n" - "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z25.s, z30.b, z2.b[1]\n" + "sdot z18.s, z29.b, z0.b[1]\n" + "sdot z22.s, z29.b, z1.b[1]\n" + "sdot z26.s, z29.b, z2.b[1]\n" + "sdot z19.s, z28.b, z0.b[1]\n" + "sdot z23.s, z28.b, z1.b[1]\n" + "sdot z27.s, z28.b, z2.b[1]\n" "ble 38f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "sdot z24.s, z5.b, z2.b[2]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z16.s, z31.b, z0.b[2]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z31.b, z1.b[2]\n" + "sdot z24.s, z31.b, z2.b[2]\n" + "sdot z17.s, z30.b, z0.b[2]\n" + "sdot z21.s, z30.b, z1.b[2]\n" "addvl x28, x28, #4\n" - "sdot z25.s, z6.b, z2.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z26.s, z7.b, z2.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "sdot z23.s, z8.b, z1.b[2]\n" - "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z25.s, z30.b, z2.b[2]\n" + "sdot z18.s, z29.b, z0.b[2]\n" + "sdot z22.s, z29.b, z1.b[2]\n" + "sdot z26.s, z29.b, z2.b[2]\n" + "sdot z19.s, z28.b, z0.b[2]\n" + "sdot z23.s, z28.b, z1.b[2]\n" + "sdot z27.s, z28.b, z2.b[2]\n" "ble 38f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z24.s, z9.b, z2.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" - "sdot z25.s, z10.b, z2.b[3]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z31.b, z0.b[3]\n" + "sdot z20.s, z31.b, z1.b[3]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z24.s, z31.b, z2.b[3]\n" + "sdot z17.s, z30.b, z0.b[3]\n" + "sdot z21.s, z30.b, z1.b[3]\n" + "sdot z25.s, z30.b, z2.b[3]\n" "addvl x28, x28, #4\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" - "sdot z26.s, z4.b, z2.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" - "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z18.s, z29.b, z0.b[3]\n" + "sdot z22.s, z29.b, z1.b[3]\n" + "sdot z26.s, z29.b, z2.b[3]\n" + "sdot z19.s, z28.b, z0.b[3]\n" + "sdot z23.s, z28.b, z1.b[3]\n" + "sdot z27.s, z28.b, z2.b[3]\n" "38:" // Height 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 39f\n" "sdot z11.s, z0.b, z15.b\n" @@ -821,33 +821,33 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z3.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "saddv d12, p0, z12.s\n" "saddv d13, p0, z13.s\n" "mov z12.s, z12.s[0]\n" 
"mov z13.s, z13.s[0]\n" - "neg z3.s, p2/M, z3.s\n" - "mul z11.s, p2/M, z11.s, z3.s\n" - "mul z12.s, p2/M, z12.s, z3.s\n" - "mul z13.s, p2/M, z13.s, z3.s\n" + "neg z28.s, p2/M, z28.s\n" + "mul z11.s, p2/M, z11.s, z28.s\n" + "mul z12.s, p2/M, z12.s, z28.s\n" + "mul z13.s, p2/M, z13.s, z28.s\n" "40:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" @@ -855,133 +855,133 @@ void sve_hybrid_s8qa_dot_4x4VL ( "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "add z17.s, z17.s, z31.s\n" + "add z18.s, z18.s, z30.s\n" + "add z19.s, z19.s, z29.s\n" "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" + "add z21.s, z21.s, z31.s\n" + "add z22.s, z22.s, z30.s\n" + "add z23.s, z23.s, z29.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" + "add z25.s, z25.s, z31.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z26.s, z26.s, z30.s\n" + "add z27.s, z27.s, z29.s\n" + ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n" + ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n" + ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n" + ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n" + ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n" + ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n" + ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n" + ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n" + ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n" + ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n" + ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n" + ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n" "tbz %x[flags], #5, 41f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd 
z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z24.d, z0.d\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" + "and z1.d, z16.d, z0.d\n" + "and z31.d, z17.d, z0.d\n" + "and z30.d, z18.d, z0.d\n" + "and z29.d, z19.d, z0.d\n" + "and z28.d, z20.d, z0.d\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z16.s, z16.s, z1.s\n" + "sqadd z17.s, z17.s, z31.s\n" + "sqadd z18.s, z18.s, z30.s\n" + "sqadd z19.s, z19.s, z29.s\n" + "sqadd z20.s, z20.s, z28.s\n" + "and z3.d, z21.d, z0.d\n" + "and z2.d, z22.d, z0.d\n" + "and z1.d, z23.d, z0.d\n" + "and z31.d, z24.d, z0.d\n" + "and z30.d, z25.d, z0.d\n" + "and z29.d, z26.d, z0.d\n" + "and z28.d, z27.d, z0.d\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z21.s, z21.s, z3.s\n" + "sqadd z22.s, z22.s, z2.s\n" + "sqadd z23.s, z23.s, z1.s\n" + "sqadd z24.s, z24.s, z31.s\n" + "sqadd z25.s, z25.s, z30.s\n" + "sqadd z26.s, z26.s, z29.s\n" + "sqadd z27.s, z27.s, z28.s\n" "41:" // Height 3: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z28.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z28.s\n" + "add z18.s, z18.s, z28.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z28.s\n" + "add z20.s, z20.s, z28.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z28.s\n" + "add z22.s, z22.s, z28.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z23.s, z23.s, z28.s\n" + "add z24.s, z24.s, z28.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z28.s\n" + "add z26.s, z26.s, z28.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z4.s\n" + "ld1rw { z29.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z28.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" 
- "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z29.s\n" + "smin z17.s, p2/M, z17.s, z29.s\n" + "smin z18.s, p2/M, z18.s, z29.s\n" + "smin z19.s, p2/M, z19.s, z29.s\n" + "smin z20.s, p2/M, z20.s, z29.s\n" + "smin z21.s, p2/M, z21.s, z29.s\n" + "smin z22.s, p2/M, z22.s, z29.s\n" + "smin z23.s, p2/M, z23.s, z29.s\n" + "smin z24.s, p2/M, z24.s, z29.s\n" + "smin z25.s, p2/M, z25.s, z29.s\n" + "smin z26.s, p2/M, z26.s, z29.s\n" + "smin z27.s, p2/M, z27.s, z29.s\n" + "smax z16.s, p2/M, z16.s, z28.s\n" + "smax z17.s, p2/M, z17.s, z28.s\n" + "smax z18.s, p2/M, z18.s, z28.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z28.s\n" + "smax z20.s, p2/M, z20.s, z28.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z28.s\n" + "smax z22.s, p2/M, z22.s, z28.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z28.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z28.s\n" "uzp1 z24.h, z24.h, z25.h\n" "st1b { z20.b }, p1, [x23]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" + "smax z27.s, p2/M, z27.s, z28.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" "st1b { z24.b }, p1, [x22]\n" "addvl x27, x27, #1\n" "42:" // Height 3: Writeback done @@ -1027,14 +1027,14 @@ void sve_hybrid_s8qa_dot_4x4VL ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1044,9 +1044,9 @@ void sve_hybrid_s8qa_dot_4x4VL ( "b 48f\n" "47:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "48:" // Height 4: input setup done "cmp x25, #0x10\n" "ble 51f\n" @@ -1059,88 +1059,88 @@ void sve_hybrid_s8qa_dot_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x21]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, 
#1, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[0]\n" - "sdot z28.s, z4.b, z3.b[0]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[0]\n" + "sdot z20.s, z5.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z24.s, z5.b, z2.b[0]\n" + "sdot z28.s, z5.b, z3.b[0]\n" + "sdot z17.s, z4.b, z0.b[0]\n" + "sdot z21.s, z4.b, z1.b[0]\n" "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "sdot z25.s, z5.b, z2.b[0]\n" - "sdot z29.s, z5.b, z3.b[0]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "sdot z25.s, z4.b, z2.b[0]\n" + "sdot z29.s, z4.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" "addvl x28, x28, #16\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "sdot z26.s, z6.b, z2.b[0]\n" - "sdot z30.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" "add x21, x21, #0x10\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "sdot z23.s, z7.b, z1.b[0]\n" - "sdot z27.s, z7.b, z2.b[0]\n" - "sdot z31.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "sdot z27.s, z9.b, z2.b[0]\n" + "sdot z31.s, z9.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" "sdot z16.s, z8.b, z0.b[1]\n" "sdot z20.s, z8.b, z1.b[1]\n" "sdot z24.s, z8.b, z2.b[1]\n" "sdot z28.s, z8.b, z3.b[1]\n" "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[1]\n" - "sdot z21.s, z9.b, z1.b[1]\n" - "sdot z25.s, z9.b, z2.b[1]\n" - "sdot z29.s, z9.b, z3.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "sdot z26.s, z10.b, z2.b[1]\n" - "sdot z30.s, z10.b, z3.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "sdot z19.s, z4.b, z0.b[1]\n" - "sdot z23.s, z4.b, z1.b[1]\n" - "sdot z27.s, z4.b, z2.b[1]\n" - "sdot z31.s, z4.b, z3.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "sdot z24.s, z5.b, z2.b[2]\n" - "sdot z28.s, z5.b, z3.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "sdot z17.s, z6.b, z0.b[2]\n" - "sdot z21.s, z6.b, z1.b[2]\n" - "sdot z25.s, z6.b, z2.b[2]\n" - "sdot z29.s, z6.b, z3.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z26.s, z7.b, z2.b[2]\n" - "sdot z30.s, z7.b, z3.b[2]\n" + "sdot z17.s, z7.b, z0.b[1]\n" + "sdot z21.s, z7.b, z1.b[1]\n" + "sdot z25.s, z7.b, z2.b[1]\n" + "sdot z29.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[1]\n" + "sdot z22.s, z6.b, z1.b[1]\n" + "sdot z26.s, z6.b, z2.b[1]\n" + "sdot z30.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + "sdot z19.s, z5.b, z0.b[1]\n" + "sdot z23.s, z5.b, z1.b[1]\n" + "sdot z27.s, z5.b, z2.b[1]\n" + "sdot z31.s, z5.b, z3.b[1]\n" + "ld1b { z5.b 
}, p2/Z, [x28, #-2, MUL VL]\n" + "sdot z16.s, z4.b, z0.b[2]\n" + "sdot z20.s, z4.b, z1.b[2]\n" + "sdot z24.s, z4.b, z2.b[2]\n" + "sdot z28.s, z4.b, z3.b[2]\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + "sdot z17.s, z10.b, z0.b[2]\n" + "sdot z21.s, z10.b, z1.b[2]\n" + "sdot z25.s, z10.b, z2.b[2]\n" + "sdot z29.s, z10.b, z3.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z26.s, z9.b, z2.b[2]\n" + "sdot z30.s, z9.b, z3.b[2]\n" "sdot z19.s, z8.b, z0.b[2]\n" "sdot z23.s, z8.b, z1.b[2]\n" "sdot z27.s, z8.b, z2.b[2]\n" "sdot z31.s, z8.b, z3.b[2]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "sdot z24.s, z9.b, z2.b[3]\n" - "sdot z28.s, z9.b, z3.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" - "sdot z25.s, z10.b, z2.b[3]\n" - "sdot z29.s, z10.b, z3.b[3]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" - "sdot z26.s, z4.b, z2.b[3]\n" - "sdot z30.s, z4.b, z3.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" - "sdot z27.s, z5.b, z2.b[3]\n" - "sdot z31.s, z5.b, z3.b[3]\n" + "sdot z16.s, z7.b, z0.b[3]\n" + "sdot z20.s, z7.b, z1.b[3]\n" + "sdot z24.s, z7.b, z2.b[3]\n" + "sdot z28.s, z7.b, z3.b[3]\n" + "sdot z17.s, z6.b, z0.b[3]\n" + "sdot z21.s, z6.b, z1.b[3]\n" + "sdot z25.s, z6.b, z2.b[3]\n" + "sdot z29.s, z6.b, z3.b[3]\n" + "sdot z18.s, z5.b, z0.b[3]\n" + "sdot z22.s, z5.b, z1.b[3]\n" + "sdot z26.s, z5.b, z2.b[3]\n" + "sdot z30.s, z5.b, z3.b[3]\n" + "sdot z19.s, z4.b, z0.b[3]\n" + "sdot z23.s, z4.b, z1.b[3]\n" + "sdot z27.s, z4.b, z2.b[3]\n" + "sdot z31.s, z4.b, z3.b[3]\n" "tbnz %x[flags], #31, 50f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z12.s, z1.b, z15.b\n" @@ -1157,95 +1157,95 @@ void sve_hybrid_s8qa_dot_4x4VL ( "subs x25, x25, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" "ld1rqb { z3.b }, p0/Z, [x21]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z4.b, z0.b[0]\n" - "sdot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z24.s, z4.b, z2.b[0]\n" - "sdot z28.s, z4.b, z3.b[0]\n" - "sdot z17.s, z5.b, z0.b[0]\n" - "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z7.b, z0.b[0]\n" + "sdot z20.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[0]\n" + "sdot z28.s, z7.b, z3.b[0]\n" + "sdot z17.s, z6.b, z0.b[0]\n" + "sdot z21.s, z6.b, z1.b[0]\n" "addvl x28, x28, #4\n" - "sdot z25.s, z5.b, z2.b[0]\n" - "sdot z29.s, z5.b, z3.b[0]\n" - "sdot z18.s, z6.b, z0.b[0]\n" - "sdot z22.s, z6.b, z1.b[0]\n" - "sdot z26.s, z6.b, z2.b[0]\n" - "sdot z30.s, z6.b, z3.b[0]\n" - "sdot z19.s, z7.b, z0.b[0]\n" - "sdot z23.s, z7.b, z1.b[0]\n" - "sdot z27.s, z7.b, z2.b[0]\n" - "sdot z31.s, z7.b, z3.b[0]\n" + "sdot z25.s, z6.b, z2.b[0]\n" + "sdot z29.s, z6.b, z3.b[0]\n" + "sdot z18.s, z5.b, z0.b[0]\n" + "sdot z22.s, z5.b, z1.b[0]\n" + "sdot z26.s, z5.b, z2.b[0]\n" + "sdot z30.s, z5.b, z3.b[0]\n" + "sdot z19.s, z4.b, z0.b[0]\n" + "sdot z23.s, z4.b, z1.b[0]\n" + "sdot z27.s, z4.b, z2.b[0]\n" + "sdot z31.s, z4.b, z3.b[0]\n" "ble 52f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "sdot z16.s, z7.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, 
[x28, #2, MUL VL]\n" "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[1]\n" - "sdot z24.s, z8.b, z2.b[1]\n" - "sdot z28.s, z8.b, z3.b[1]\n" - "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z20.s, z7.b, z1.b[1]\n" + "sdot z24.s, z7.b, z2.b[1]\n" + "sdot z28.s, z7.b, z3.b[1]\n" + "sdot z17.s, z6.b, z0.b[1]\n" "addvl x28, x28, #4\n" - "sdot z21.s, z9.b, z1.b[1]\n" - "sdot z25.s, z9.b, z2.b[1]\n" - "sdot z29.s, z9.b, z3.b[1]\n" - "sdot z18.s, z10.b, z0.b[1]\n" - "sdot z22.s, z10.b, z1.b[1]\n" - "sdot z26.s, z10.b, z2.b[1]\n" - "sdot z30.s, z10.b, z3.b[1]\n" + "sdot z21.s, z6.b, z1.b[1]\n" + "sdot z25.s, z6.b, z2.b[1]\n" + "sdot z29.s, z6.b, z3.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot z26.s, z5.b, z2.b[1]\n" + "sdot z30.s, z5.b, z3.b[1]\n" "sdot z19.s, z4.b, z0.b[1]\n" "sdot z23.s, z4.b, z1.b[1]\n" "sdot z27.s, z4.b, z2.b[1]\n" "sdot z31.s, z4.b, z3.b[1]\n" "ble 52f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z20.s, z5.b, z1.b[2]\n" - "sdot z24.s, z5.b, z2.b[2]\n" - "sdot z28.s, z5.b, z3.b[2]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "sdot z24.s, z7.b, z2.b[2]\n" + "sdot z28.s, z7.b, z3.b[2]\n" "sdot z17.s, z6.b, z0.b[2]\n" "addvl x28, x28, #4\n" "sdot z21.s, z6.b, z1.b[2]\n" "sdot z25.s, z6.b, z2.b[2]\n" "sdot z29.s, z6.b, z3.b[2]\n" - "sdot z18.s, z7.b, z0.b[2]\n" - "sdot z22.s, z7.b, z1.b[2]\n" - "sdot z26.s, z7.b, z2.b[2]\n" - "sdot z30.s, z7.b, z3.b[2]\n" - "sdot z19.s, z8.b, z0.b[2]\n" - "sdot z23.s, z8.b, z1.b[2]\n" - "sdot z27.s, z8.b, z2.b[2]\n" - "sdot z31.s, z8.b, z3.b[2]\n" + "sdot z18.s, z5.b, z0.b[2]\n" + "sdot z22.s, z5.b, z1.b[2]\n" + "sdot z26.s, z5.b, z2.b[2]\n" + "sdot z30.s, z5.b, z3.b[2]\n" + "sdot z19.s, z4.b, z0.b[2]\n" + "sdot z23.s, z4.b, z1.b[2]\n" + "sdot z27.s, z4.b, z2.b[2]\n" + "sdot z31.s, z4.b, z3.b[2]\n" "ble 52f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "sdot z16.s, z9.b, z0.b[3]\n" - "sdot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "sdot z24.s, z9.b, z2.b[3]\n" - "sdot z28.s, z9.b, z3.b[3]\n" - "sdot z17.s, z10.b, z0.b[3]\n" - "sdot z21.s, z10.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "sdot z16.s, z7.b, z0.b[3]\n" + "sdot z20.s, z7.b, z1.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[3]\n" + "sdot z28.s, z7.b, z3.b[3]\n" + "sdot z17.s, z6.b, z0.b[3]\n" + "sdot z21.s, z6.b, z1.b[3]\n" "addvl x28, x28, #4\n" - "sdot z25.s, z10.b, z2.b[3]\n" - "sdot z29.s, z10.b, z3.b[3]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z22.s, z4.b, z1.b[3]\n" - "sdot z26.s, z4.b, z2.b[3]\n" - "sdot z30.s, z4.b, z3.b[3]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z23.s, z5.b, z1.b[3]\n" - "sdot z27.s, z5.b, z2.b[3]\n" - "sdot z31.s, z5.b, z3.b[3]\n" + "sdot z25.s, z6.b, z2.b[3]\n" + "sdot z29.s, z6.b, z3.b[3]\n" + "sdot z18.s, z5.b, z0.b[3]\n" + "sdot z22.s, z5.b, z1.b[3]\n" + "sdot z26.s, z5.b, z2.b[3]\n" + "sdot z30.s, z5.b, z3.b[3]\n" + "sdot z19.s, z4.b, z0.b[3]\n" + "sdot z23.s, z4.b, z1.b[3]\n" + "sdot z27.s, z4.b, z2.b[3]\n" + "sdot z31.s, z4.b, z3.b[3]\n" 
"52:" // Height 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 53f\n" "sdot z11.s, z0.b, z15.b\n" @@ -1265,7 +1265,7 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" "saddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "saddv d12, p0, z12.s\n" @@ -1273,28 +1273,28 @@ void sve_hybrid_s8qa_dot_4x4VL ( "mov z12.s, z12.s[0]\n" "mov z13.s, z13.s[0]\n" "saddv d14, p0, z14.s\n" - "neg z4.s, p2/M, z4.s\n" + "neg z0.s, p2/M, z0.s\n" "mov z14.s, z14.s[0]\n" - "mul z11.s, p2/M, z11.s, z4.s\n" - "mul z12.s, p2/M, z12.s, z4.s\n" - "mul z13.s, p2/M, z13.s, z4.s\n" - "mul z14.s, p2/M, z14.s, z4.s\n" + "mul z11.s, p2/M, z11.s, z0.s\n" + "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z13.s, p2/M, z13.s, z0.s\n" + "mul z14.s, p2/M, z14.s, z0.s\n" "54:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" @@ -1305,174 +1305,174 @@ void sve_hybrid_s8qa_dot_4x4VL ( "add z29.s, z29.s, z14.s\n" "add z30.s, z30.s, z14.s\n" "add z31.s, z31.s, z14.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z1.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z0.s\n" + "add z18.s, z18.s, z3.s\n" + "add z19.s, z19.s, z2.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z0.s\n" + "add z22.s, z22.s, z3.s\n" + "add z23.s, z23.s, z2.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z0.s\n" + "add z26.s, z26.s, z3.s\n" + "add z27.s, z27.s, z2.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z0.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z2.s\n" - "add z31.s, z31.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" - ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" - ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" - ".inst 0x04a477de // sqrdmulh 
z30.s, z30.s, z4.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "add z30.s, z30.s, z3.s\n" + "add z31.s, z31.s, z2.s\n" + ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" + ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" + ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" + ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" + ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" + ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" + ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" + ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" + ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" + ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" + ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" + ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" + ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" + ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" + ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" + ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" "tbz %x[flags], #5, 55f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z24.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" + "and z2.d, z16.d, z0.d\n" + "and z1.d, z17.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z16.s, z16.s, z2.s\n" + "sqadd z17.s, z17.s, z1.s\n" + "and z7.d, z18.d, z0.d\n" + "and z6.d, z19.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z4.d, z21.d, z0.d\n" + "and z3.d, z22.d, z0.d\n" + "and z2.d, z23.d, z0.d\n" + "and z1.d, z24.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "and z9.d, z28.d, z0.d\n" - "and z10.d, z29.d, z0.d\n" - "and z4.d, z30.d, z0.d\n" - "and z5.d, z31.d, z0.d\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z18.s, z18.s, z7.s\n" + "sqadd z19.s, z19.s, z6.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z4.s\n" + "sqadd z22.s, z22.s, z3.s\n" + "sqadd z23.s, z23.s, z2.s\n" + "sqadd z24.s, z24.s, z1.s\n" + "and z7.d, z25.d, z0.d\n" + "and z6.d, z26.d, z0.d\n" + "and z5.d, z27.d, z0.d\n" + "and z4.d, z28.d, z0.d\n" + "and z3.d, z29.d, z0.d\n" + "and z2.d, z30.d, z0.d\n" + "and z1.d, z31.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" - "sqadd z28.s, z28.s, z9.s\n" - "sqadd z29.s, z29.s, z10.s\n" - "sqadd z30.s, z30.s, z4.s\n" - "sqadd z31.s, z31.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z25.s, z25.s, z7.s\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd 
z27.s, z27.s, z5.s\n" + "sqadd z28.s, z28.s, z4.s\n" + "sqadd z29.s, z29.s, z3.s\n" + "sqadd z30.s, z30.s, z2.s\n" + "sqadd z31.s, z31.s, z1.s\n" "55:" // Height 4: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z2.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z2.s\n" + "add z20.s, z20.s, z2.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z2.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z23.s, z23.s, z2.s\n" + "add z24.s, z24.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z27.s, z27.s, z4.s\n" - "add z28.s, z28.s, z4.s\n" + "add z27.s, z27.s, z2.s\n" + "add z28.s, z28.s, z2.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" - "add z29.s, z29.s, z4.s\n" - "add z30.s, z30.s, z4.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z2.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z31.s, z31.s, z4.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add z31.s, z31.s, z2.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z1.s\n" + "smin z17.s, p2/M, z17.s, z1.s\n" + "smin z18.s, p2/M, z18.s, z1.s\n" + "smin z19.s, p2/M, z19.s, z1.s\n" + "smin z20.s, p2/M, z20.s, z1.s\n" + "smin z21.s, p2/M, z21.s, z1.s\n" + "smin z22.s, p2/M, z22.s, z1.s\n" + "smin z23.s, p2/M, z23.s, z1.s\n" + "smin z24.s, p2/M, z24.s, z1.s\n" + "smin z25.s, p2/M, z25.s, z1.s\n" + "smin z26.s, p2/M, z26.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z1.s\n" + "smin z29.s, p2/M, z29.s, z1.s\n" + "smin z30.s, p2/M, z30.s, z1.s\n" + "smin z31.s, p2/M, z31.s, z1.s\n" + 
"smax z16.s, p2/M, z16.s, z0.s\n" + "smax z17.s, p2/M, z17.s, z0.s\n" + "smax z18.s, p2/M, z18.s, z0.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z0.s\n" + "smax z20.s, p2/M, z20.s, z0.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z0.s\n" + "smax z22.s, p2/M, z22.s, z0.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z0.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z0.s\n" "uzp1 z24.h, z24.h, z25.h\n" "st1b { z20.b }, p1, [x23]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "smax z29.s, p2/M, z29.s, z5.s\n" - "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z28.s, p2/M, z28.s, z0.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "smax z29.s, p2/M, z29.s, z0.s\n" + "smax z30.s, p2/M, z30.s, z0.s\n" "uzp1 z28.h, z28.h, z29.h\n" "st1b { z24.b }, p1, [x22]\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z28.b, z28.b, z29.b\n" + "smax z31.s, p2/M, z31.s, z0.s\n" + "uzp1 z16.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z16.b\n" "st1b { z28.b }, p1, [x21]\n" "addvl x27, x27, #1\n" "56:" // Height 4: Writeback done @@ -1491,7 +1491,6 @@ void sve_hybrid_s8qa_dot_4x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "58:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1499,4 +1498,4 @@ void sve_hybrid_s8qa_dot_4x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp index 9681505e8c..ae922e9743 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -97,5 +96,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp index 626a06b26b..e0628364f4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp @@ -108,11 +108,11 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -125,41 +125,41 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL 
VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "trn1 z0.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n" + "trn2 z1.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n" + ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n" + ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n" + ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n" + ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" + ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" + ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" + ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" "add x24, x24, #0x10\n" "tbnz %x[flags], #31, 8f\n" "sdot z11.s, z0.b, z15.b\n" @@ -171,43 +171,43 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "trn1 z0.d, z1.d, z27.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - 
"ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z27.d\n" + ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n" + ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n" + ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n" + ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n" + ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n" "addvl x28, x28, #8\n" "ble 10f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n" + ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n" + ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" + ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" "addvl x28, x28, #8\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" @@ -224,74 +224,74 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp1 z19.d, z19.d, z23.d\n" "mov z23.d, z16.d\n" "tbnz %x[flags], #31, 12f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z1.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "neg z1.s, p2/M, z1.s\n" + "neg z16.s, p2/M, z16.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z1.s\n" + "mul z11.s, p2/M, z11.s, z16.s\n" "12:" // Height 1: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x10]\n" + "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "ld1w { z20.s }, p2/Z, [x10, #2, 
MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add z23.s, z23.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + "add z23.s, z23.s, z22.s\n" + "add z17.s, z17.s, z21.s\n" + "add z18.s, z18.s, z20.s\n" + "add z19.s, z19.s, z16.s\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n" + ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n" "addvl x10, x10, #4\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n" + ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n" "tbz %x[flags], #5, 13f\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z22.d, z23.d, z0.d\n" + "and z21.d, z17.d, z0.d\n" + "and z20.d, z18.d, z0.d\n" + "and z16.d, z19.d, z0.d\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z23.s, z23.s, z22.s\n" + "sqadd z17.s, z17.s, z21.s\n" + "sqadd z18.s, z18.s, z20.s\n" + "sqadd z19.s, z19.s, z16.s\n" "13:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z16.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z16.s\n" + "add z18.s, z18.s, z16.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z19.s, z19.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z16.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z20.s\n" + "smin z17.s, p2/M, z17.s, z20.s\n" + "smin z18.s, p2/M, z18.s, z20.s\n" + "smin z19.s, p2/M, z19.s, z20.s\n" + "smax z23.s, p2/M, z23.s, z16.s\n" + "smax z17.s, p2/M, z17.s, z16.s\n" + "smax z18.s, p2/M, z18.s, z16.s\n" "uzp1 z23.h, z23.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "uzp1 z17.h, z18.h, z19.h\n" - "uzp1 z23.b, z23.b, z17.b\n" + "smax z19.s, p2/M, z19.s, z16.s\n" + "uzp1 z16.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z16.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" "14:" // Height 1: Writeback done @@ -324,12 +324,12 
@@ void sve_hybrid_s8qa_mmla_4x4VL ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -337,49 +337,49 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "b 20f\n" "19:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "20:" // Height 2: input setup done "cmp x25, #0x10\n" "ble 23f\n" "21:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1rqb { z26.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n" + "trn2 z1.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45199814 // smmla z20.s, z0.b, z25.b\n" + ".inst 0x45189811 // smmla z17.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x451a9815 // smmla z21.s, z0.b, z26.b\n" + ".inst 0x45199812 // smmla z18.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + ".inst 0x45189816 // smmla z22.s, z0.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x451a9813 // smmla z19.s, z0.b, z26.b\n" + ".inst 0x45199817 // smmla z23.s, z0.b, z25.b\n" 
+ "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45189830 // smmla z16.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x451a9834 // smmla z20.s, z1.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n" + ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x451b9832 // smmla z18.s, z1.b, z27.b\n" + ".inst 0x451a9836 // smmla z22.s, z1.b, z26.b\n" + ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" + ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "tbnz %x[flags], #31, 22f\n" @@ -392,44 +392,44 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z27.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189810 // smmla z16.s, z0.b, z24.b\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z27.d\n" + ".inst 0x451a9814 // smmla z20.s, z0.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45199811 // smmla z17.s, z0.b, z25.b\n" + ".inst 0x45189815 // smmla z21.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x451b9812 // smmla z18.s, z0.b, z27.b\n" + ".inst 0x451a9816 // smmla z22.s, z0.b, z26.b\n" + ".inst 0x45199813 // smmla z19.s, z0.b, z25.b\n" + ".inst 0x45189817 // smmla z23.s, z0.b, z24.b\n" "addvl x28, x28, #8\n" "ble 24f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45189830 // 
smmla z16.s, z1.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45189834 // smmla z20.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45199831 // smmla z17.s, z1.b, z25.b\n" + ".inst 0x45189835 // smmla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45199832 // smmla z18.s, z1.b, z25.b\n" + ".inst 0x45189836 // smmla z22.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45199833 // smmla z19.s, z1.b, z25.b\n" + ".inst 0x45189837 // smmla z23.s, z1.b, z24.b\n" "addvl x28, x28, #8\n" "24:" // Height 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 25f\n" @@ -440,133 +440,133 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 18b\n" - "uzp1 z7.d, z16.d, z20.d\n" + "uzp1 z24.d, z16.d, z20.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x22, x27, x20\n" + "add x23, x27, x20\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "mov z23.d, z7.d\n" + "mov z23.d, z24.d\n" "tbnz %x[flags], #31, 26f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z2.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "neg z2.s, p2/M, z2.s\n" + "neg z24.s, p2/M, z24.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z2.s\n" - "mul z12.s, p2/M, z12.s, z2.s\n" + "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z12.s, p2/M, z12.s, z24.s\n" "26:" // Height 2: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10]\n" + "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add z23.s, z23.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z23.s, z23.s, z28.s\n" + "add z20.s, z20.s, z27.s\n" "addvl x10, x10, #4\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "add z21.s, z21.s, z26.s\n" + "add z22.s, z22.s, z25.s\n" + "add z16.s, z16.s, 
z28.s\n" + "add z17.s, z17.s, z27.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z18.s, z18.s, z26.s\n" + "add z19.s, z19.s, z25.s\n" + ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" + ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" + ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" + ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" + ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" + ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" + ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" + ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" "tbz %x[flags], #5, 27f\n" - "and z4.d, z23.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "and z5.d, z20.d, z0.d\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" + "and z24.d, z23.d, z0.d\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z23.s, z23.s, z24.s\n" + "and z30.d, z20.d, z0.d\n" + "and z29.d, z21.d, z0.d\n" + "and z28.d, z22.d, z0.d\n" + "and z27.d, z16.d, z0.d\n" + "and z26.d, z17.d, z0.d\n" + "and z25.d, z18.d, z0.d\n" + "and z24.d, z19.d, z0.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z20.s, z20.s, z30.s\n" + "sqadd z21.s, z21.s, z29.s\n" + "sqadd z22.s, z22.s, z28.s\n" + "sqadd z16.s, z16.s, z27.s\n" + "sqadd z17.s, z17.s, z26.s\n" + "sqadd z18.s, z18.s, z25.s\n" + "sqadd z19.s, z19.s, z24.s\n" "27:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z24.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z24.s\n" + "add z21.s, z21.s, z24.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z24.s\n" + "add z16.s, z16.s, z24.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z24.s\n" + "add z18.s, z18.s, z24.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z19.s, z19.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax 
z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z24.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z25.s\n" + "smin z20.s, p2/M, z20.s, z25.s\n" + "smin z21.s, p2/M, z21.s, z25.s\n" + "smin z22.s, p2/M, z22.s, z25.s\n" + "smin z16.s, p2/M, z16.s, z25.s\n" + "smin z17.s, p2/M, z17.s, z25.s\n" + "smin z18.s, p2/M, z18.s, z25.s\n" + "smin z19.s, p2/M, z19.s, z25.s\n" + "smax z23.s, p2/M, z23.s, z24.s\n" + "smax z20.s, p2/M, z20.s, z24.s\n" + "smax z21.s, p2/M, z21.s, z24.s\n" "uzp1 z23.h, z23.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z24.s\n" + "smax z16.s, p2/M, z16.s, z24.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z23.b, z23.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z24.s\n" + "smax z18.s, p2/M, z18.s, z24.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z23.b }, p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z24.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x22]\n" + "st1b { z16.b }, p1, [x23]\n" "addvl x27, x27, #1\n" "28:" // Height 2: Writeback done "decw x9, ALL, MUL #4\n" @@ -607,13 +607,13 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -622,8 +622,8 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "34:" // Height 3: input setup done "cmp x25, #0x10\n" "ble 37f\n" @@ -634,60 +634,60 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x22]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn1 z2.d, z3.d, z5.d\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" + 
"ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n" + ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45099815 // smmla z21.s, z0.b, z9.b\n" + ".inst 0x4509985d // smmla z29.s, z2.b, z9.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n" + ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n" + ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n" + ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" "add x23, x23, #0x10\n" ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" + ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" + ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" + ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" + ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" + ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" + ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" + ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" + ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" + ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" - ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" + ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" "tbnz %x[flags], #31, 36f\n" "sdot 
z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" @@ -708,56 +708,56 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "trn1 z2.d, z3.d, z4.d\n" "trn2 z3.d, z3.d, z4.d\n" ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n" + ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45099811 // smmla z17.s, z0.b, z9.b\n" + ".inst 0x45099859 // smmla z25.s, z2.b, z9.b\n" ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" "addvl x28, x28, #8\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n" + ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n" + ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n" + ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n" + ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n" + ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n" + ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n" + ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n" "ble 38f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" + ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n" + ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n" + ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - 
".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" + ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" + ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" + ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" - ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" + ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" "38:" // Height 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 39f\n" "sdot z11.s, z0.b, z15.b\n" @@ -770,12 +770,12 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "cmp x26, x20\n" "bne 32b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z16.d, z20.d\n" - "add x22, x27, x20\n" + "uzp1 z0.d, z16.d, z20.d\n" + "add x23, x27, x20\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" - "add x21, x22, x20\n" + "add x22, x23, x20\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" @@ -784,170 +784,170 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" + "mov z31.d, z0.d\n" "tbnz %x[flags], #31, 40f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z3.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "neg z3.s, p2/M, z3.s\n" + "neg z23.s, p2/M, z23.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z11.s, p2/M, z11.s, z23.s\n" "mov z13.s, z13.s[0]\n" - "mul z12.s, p2/M, z12.s, z3.s\n" - "mul z13.s, p2/M, z13.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z23.s\n" + "mul z13.s, p2/M, z13.s, z23.s\n" "40:" // Height 3: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" "addvl x10, x10, #4\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" + "add z20.s, z20.s, z30.s\n" + "add z21.s, z21.s, z29.s\n" + "add z22.s, z22.s, z28.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "add z17.s, z17.s, z30.s\n" + "add z18.s, z18.s, 
z29.s\n" + "add z19.s, z19.s, z28.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z25.s, z25.s, z30.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z26.s, z26.s, z29.s\n" + "add z27.s, z27.s, z28.s\n" + ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n" + ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n" + ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n" + ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" + ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n" + ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n" + ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n" + ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n" "tbz %x[flags], #5, 41f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "and z5.d, z24.d, z0.d\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" + "and z1.d, z31.d, z0.d\n" + "and z30.d, z20.d, z0.d\n" + "and z29.d, z21.d, z0.d\n" + "and z28.d, z22.d, z0.d\n" + "and z23.d, z16.d, z0.d\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z31.s, z31.s, z1.s\n" + "sqadd z20.s, z20.s, z30.s\n" + "sqadd z21.s, z21.s, z29.s\n" + "sqadd z22.s, z22.s, z28.s\n" + "sqadd z16.s, z16.s, z23.s\n" + "and z3.d, z17.d, z0.d\n" + "and z2.d, z18.d, z0.d\n" + "and z1.d, z19.d, z0.d\n" + "and z30.d, z24.d, z0.d\n" + "and z29.d, z25.d, z0.d\n" + "and z28.d, z26.d, z0.d\n" + "and z23.d, z27.d, z0.d\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + 
"sqadd z17.s, z17.s, z3.s\n" + "sqadd z18.s, z18.s, z2.s\n" + "sqadd z19.s, z19.s, z1.s\n" + "sqadd z24.s, z24.s, z30.s\n" + "sqadd z25.s, z25.s, z29.s\n" + "sqadd z26.s, z26.s, z28.s\n" + "sqadd z27.s, z27.s, z23.s\n" "41:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add z31.s, z31.s, z4.s\n" + "add z31.s, z31.s, z23.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z23.s\n" + "add z21.s, z21.s, z23.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z23.s\n" + "add z16.s, z16.s, z23.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z23.s\n" + "add z18.s, z18.s, z23.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z19.s, z19.s, z23.s\n" + "add z24.s, z24.s, z23.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z23.s\n" + "add z26.s, z26.s, z23.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z27.s, z27.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z23.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z28.s\n" + "smin z20.s, p2/M, z20.s, z28.s\n" + "smin z21.s, p2/M, z21.s, z28.s\n" + "smin z22.s, p2/M, z22.s, z28.s\n" + "smin z16.s, p2/M, z16.s, z28.s\n" + "smin z17.s, p2/M, z17.s, z28.s\n" + "smin z18.s, p2/M, z18.s, z28.s\n" + "smin z19.s, p2/M, z19.s, z28.s\n" + "smin z24.s, p2/M, z24.s, z28.s\n" + "smin z25.s, p2/M, z25.s, z28.s\n" + "smin z26.s, p2/M, z26.s, z28.s\n" + "smin z27.s, p2/M, z27.s, z28.s\n" + "smax z31.s, p2/M, z31.s, z23.s\n" + "smax z20.s, p2/M, z20.s, z23.s\n" + "smax z21.s, p2/M, z21.s, z23.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z23.s\n" + "smax z16.s, p2/M, z16.s, z23.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z23.s\n" + "smax z18.s, p2/M, z18.s, z23.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z31.b }, 
p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z23.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z23.s\n" + "smax z26.s, p2/M, z26.s, z23.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x21]\n" + "st1b { z16.b }, p1, [x23]\n" + "smax z27.s, p2/M, z27.s, z23.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x22]\n" "addvl x27, x27, #1\n" "42:" // Height 3: Writeback done "decw x9, ALL, MUL #4\n" @@ -992,14 +992,14 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1009,9 +1009,9 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "b 48f\n" "47:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "48:" // Height 4: input setup done "cmp x25, #0x10\n" "ble 51f\n" @@ -1021,63 +1021,63 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "ld1rqb { z2.b }, p0/Z, [x23]\n" "trn1 z0.d, z1.d, z2.d\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z4.b }, p0/Z, [x21]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" - ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" + "trn1 z2.d, z3.d, z5.d\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n" + ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45049814 // smmla z20.s, z0.b, z4.b\n" + ".inst 0x4504985c // smmla z28.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45059811 // smmla z17.s, z0.b, z5.b\n" + ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + 
"ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45049815 // smmla z21.s, z0.b, z4.b\n" + ".inst 0x4504985d // smmla z29.s, z2.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" - ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45089812 // smmla z18.s, z0.b, z8.b\n" + ".inst 0x4508985a // smmla z26.s, z2.b, z8.b\n" + ".inst 0x45079816 // smmla z22.s, z0.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x4507985e // smmla z30.s, z2.b, z7.b\n" + ".inst 0x45069813 // smmla z19.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x4506985b // smmla z27.s, z2.b, z6.b\n" ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" "add x24, x24, #0x10\n" ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" + ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x450a9834 // smmla z20.s, z1.b, z10.b\n" "add x22, x22, #0x10\n" - ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" + ".inst 0x450a987c // smmla z28.s, z3.b, z10.b\n" + ".inst 0x45099831 // smmla z17.s, z1.b, z9.b\n" "add x21, x21, #0x10\n" - ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45099879 // smmla z25.s, z3.b, z9.b\n" + ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" + ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" + ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" + ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" + ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" - ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" + ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" "tbnz %x[flags], #31, 50f\n" "sdot z11.s, z0.b, z15.b\n" "sdot z13.s, z2.b, z15.b\n" @@ -1093,62 +1093,62 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "ld1rqb { z2.b }, p0/Z, [x23]\n" "trn1 z0.d, z1.d, z2.d\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z4.b }, p0/Z, [x21]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - 
"ld1b { z5.b }, p2/Z, [x28]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n" - ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "trn1 z2.d, z3.d, z5.d\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45049810 // smmla z16.s, z0.b, z4.b\n" + ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" "subs x25, x25, #0x8\n" - ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n" + ".inst 0x45059814 // smmla z20.s, z0.b, z5.b\n" "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n" - ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x4505985c // smmla z28.s, z2.b, z5.b\n" + ".inst 0x45049811 // smmla z17.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n" ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n" - ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n" + ".inst 0x45079812 // smmla z18.s, z0.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n" - ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n" - ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n" - ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n" - ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n" - ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n" - ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n" + ".inst 0x4507985a // smmla z26.s, z2.b, z7.b\n" + ".inst 0x45069816 // smmla z22.s, z0.b, z6.b\n" + ".inst 0x4506985e // smmla z30.s, z2.b, z6.b\n" + ".inst 0x45059813 // smmla z19.s, z0.b, z5.b\n" + ".inst 0x4505985b // smmla z27.s, z2.b, z5.b\n" + ".inst 0x45049817 // smmla z23.s, z0.b, z4.b\n" + ".inst 0x4504985f // smmla z31.s, z2.b, z4.b\n" "ble 52f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n" - ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n" - ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" + ".inst 0x45049878 // smmla z24.s, z3.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45059834 // smmla z20.s, z1.b, z5.b\n" + ".inst 0x4505987c // smmla z28.s, z3.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45049831 // smmla z17.s, z1.b, z4.b\n" + ".inst 0x45049879 // smmla z25.s, z3.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n" - ".inst 
0x4509987d // smmla z29.s, z3.b, z9.b\n" - ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n" - ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45089835 // smmla z21.s, z1.b, z8.b\n" + ".inst 0x4508987d // smmla z29.s, z3.b, z8.b\n" + ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + ".inst 0x4507987a // smmla z26.s, z3.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n" - ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n" + ".inst 0x45069836 // smmla z22.s, z1.b, z6.b\n" + ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n" ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" - ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n" - ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n" + ".inst 0x45049837 // smmla z23.s, z1.b, z4.b\n" + ".inst 0x4504987f // smmla z31.s, z3.b, z4.b\n" "52:" // Height 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 53f\n" "sdot z11.s, z0.b, z15.b\n" @@ -1161,12 +1161,12 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "cmp x26, x20\n" "bne 46b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z16.d, z20.d\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "uzp1 z0.d, z16.d, z20.d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" - "add x20, x21, x20\n" + "add x21, x22, x20\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" @@ -1180,38 +1180,38 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" + "mov z31.d, z0.d\n" "tbnz %x[flags], #31, 54f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "neg z4.s, p2/M, z4.s\n" + "neg z0.s, p2/M, z0.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z11.s, p2/M, z11.s, z0.s\n" "mov z14.s, z13.s[3]\n" "mov z13.s, z13.s[0]\n" - "mul z12.s, p2/M, z12.s, z4.s\n" - "mul z13.s, p2/M, z13.s, z4.s\n" - "mul z14.s, p2/M, z14.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z13.s, p2/M, z13.s, z0.s\n" + "mul z14.s, p2/M, z14.s, z0.s\n" "54:" // Height 4: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add z23.s, z23.s, z13.s\n" "add z28.s, z28.s, z13.s\n" "addvl x10, x10, #4\n" @@ -1221,175 +1221,175 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "add z25.s, z25.s, z14.s\n" "add z26.s, z26.s, z14.s\n" "add z27.s, z27.s, z14.s\n" - "add z31.s, z31.s, z0.s\n" - "add z20.s, 
z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z23.s, z23.s, z0.s\n" - "add z28.s, z28.s, z1.s\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" - ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" - ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z3.s\n" + "add z22.s, z22.s, z2.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z0.s\n" + "add z18.s, z18.s, z3.s\n" + "add z19.s, z19.s, z2.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z3.s\n" + "add z30.s, z30.s, z2.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z26.s, z26.s, z3.s\n" + "add z27.s, z27.s, z2.s\n" + ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" + ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" + ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" + ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" + ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" + ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" + ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" + ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" + ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" + ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" + ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" + ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" + ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" + ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" + ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" + ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" "tbz %x[flags], #5, 55f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "and z5.d, z23.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" + "and z2.d, z31.d, z0.d\n" + "and z1.d, z20.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z31.s, z31.s, z2.s\n" + "sqadd z20.s, z20.s, z1.s\n" + "and z7.d, z21.d, z0.d\n" + "and z6.d, z22.d, z0.d\n" + "and z5.d, z16.d, z0.d\n" + "and z4.d, z17.d, z0.d\n" + "and z3.d, z18.d, z0.d\n" + "and z2.d, z19.d, z0.d\n" + "and z1.d, z23.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, 
#0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "sqadd z23.s, z23.s, z5.s\n" - "and z6.d, z28.d, z0.d\n" - "and z7.d, z29.d, z0.d\n" - "and z8.d, z30.d, z0.d\n" - "and z9.d, z24.d, z0.d\n" - "and z10.d, z25.d, z0.d\n" - "and z4.d, z26.d, z0.d\n" - "and z5.d, z27.d, z0.d\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z21.s, z21.s, z7.s\n" + "sqadd z22.s, z22.s, z6.s\n" + "sqadd z16.s, z16.s, z5.s\n" + "sqadd z17.s, z17.s, z4.s\n" + "sqadd z18.s, z18.s, z3.s\n" + "sqadd z19.s, z19.s, z2.s\n" + "sqadd z23.s, z23.s, z1.s\n" + "and z7.d, z28.d, z0.d\n" + "and z6.d, z29.d, z0.d\n" + "and z5.d, z30.d, z0.d\n" + "and z4.d, z24.d, z0.d\n" + "and z3.d, z25.d, z0.d\n" + "and z2.d, z26.d, z0.d\n" + "and z1.d, z27.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "sqadd z28.s, z28.s, z6.s\n" - "sqadd z29.s, z29.s, z7.s\n" - "sqadd z30.s, z30.s, z8.s\n" - "sqadd z24.s, z24.s, z9.s\n" - "sqadd z25.s, z25.s, z10.s\n" - "sqadd z26.s, z26.s, z4.s\n" - "sqadd z27.s, z27.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z28.s, z28.s, z7.s\n" + "sqadd z29.s, z29.s, z6.s\n" + "sqadd z30.s, z30.s, z5.s\n" + "sqadd z24.s, z24.s, z4.s\n" + "sqadd z25.s, z25.s, z3.s\n" + "sqadd z26.s, z26.s, z2.s\n" + "sqadd z27.s, z27.s, z1.s\n" "55:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add z31.s, z31.s, z4.s\n" + "add z31.s, z31.s, z2.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z2.s\n" + "add z21.s, z21.s, z2.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z2.s\n" + "add z16.s, z16.s, z2.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z23.s, z23.s, z4.s\n" + "add z19.s, z19.s, z2.s\n" + "add z23.s, z23.s, z2.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" - "add z28.s, z28.s, z4.s\n" - "add z29.s, z29.s, z4.s\n" + "add z28.s, z28.s, z2.s\n" + "add z29.s, z29.s, z2.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z30.s, z30.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z30.s, z30.s, z2.s\n" + "add z24.s, z24.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // 
srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z27.s, z27.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z2.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z1.s\n" + "smin z20.s, p2/M, z20.s, z1.s\n" + "smin z21.s, p2/M, z21.s, z1.s\n" + "smin z22.s, p2/M, z22.s, z1.s\n" + "smin z16.s, p2/M, z16.s, z1.s\n" + "smin z17.s, p2/M, z17.s, z1.s\n" + "smin z18.s, p2/M, z18.s, z1.s\n" + "smin z19.s, p2/M, z19.s, z1.s\n" + "smin z23.s, p2/M, z23.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z1.s\n" + "smin z29.s, p2/M, z29.s, z1.s\n" + "smin z30.s, p2/M, z30.s, z1.s\n" + "smin z24.s, p2/M, z24.s, z1.s\n" + "smin z25.s, p2/M, z25.s, z1.s\n" + "smin z26.s, p2/M, z26.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z1.s\n" + "smax z31.s, p2/M, z31.s, z0.s\n" + "smax z20.s, p2/M, z20.s, z0.s\n" + "smax z21.s, p2/M, z21.s, z0.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z0.s\n" + "smax z16.s, p2/M, z16.s, z0.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z0.s\n" + "smax z18.s, p2/M, z18.s, z0.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z31.b }, p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z0.s\n" + "smax z23.s, p2/M, z23.s, z0.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z0.s\n" "uzp1 z23.h, z23.h, z28.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z30.s, p2/M, z30.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z28.h, z29.h, z30.h\n" - "uzp1 z23.b, z23.b, z28.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "st1b { z16.b }, p1, [x23]\n" + "smax z30.s, p2/M, z30.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z0.s\n" + "uzp1 z16.h, z29.h, z30.h\n" + "uzp1 z23.b, z23.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z0.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z23.b }, p1, [x21]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x20]\n" + "st1b { z23.b }, p1, [x22]\n" + "smax z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x21]\n" "addvl x27, x27, 
#1\n" "56:" // Height 4: Writeback done "decw x9, ALL, MUL #4\n" @@ -1407,7 +1407,6 @@ void sve_hybrid_s8qa_mmla_4x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "58:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1415,4 +1414,4 @@ void sve_hybrid_s8qa_mmla_4x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp index dad04c81e8..056ae7a616 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#pragma once
-
#ifdef ARM_COMPUTE_ENABLE_SVE
+
#include "../std_transforms_sve.hpp"
#include "../performance_parameters.hpp"
@@ -74,7 +74,6 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
-
if (std::is_same<T, int8_t>::value) {
switch (ci->get_cpu_model()) {
default:
@@ -97,5 +96,4 @@ public:
} // namespace arm_gemm
#undef ARGLIST
-
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 1e71806838..c28717a37e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -113,11 +113,11 @@ void sve_hybrid_s8qs_dot_6x4VL (
"4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr w27, [x20, x28, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
"tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x26, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
"cbnz x28, 6f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
"add x26, x26, x20\n"
@@ -130,101 +130,101 @@ void sve_hybrid_s8qs_dot_6x4VL (
"7:" // Height 1: Multiply loop: Main loop head
"whilelt p0.b, XZR, x27\n"
"ld1rqb { z0.b }, p0/Z, [x26]\n"
- "ld1b { z6.b }, p2/Z, [x9]\n"
- "sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n"
- "sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n"
- "sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9]\n"
+ "sdot z8.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #2, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[0]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #4, MUL VL]\n"
+ "sdot z8.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n"
+ "sdot z9.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #6, MUL VL]\n"
+ "sdot z10.s, z16.b, z0.b[1]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n"
"addvl x9, x9, #16\n"
- "sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[2]\n"
- "sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n"
- "sdot z10.s, z6.b, z0.b[2]\n"
- "sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n"
- "sdot z8.s, z6.b, z0.b[3]\n"
- "sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n"
+ "sdot z11.s, z16.b, z0.b[1]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n"
+ "sdot z8.s, z17.b, z0.b[2]\n"
+ "sdot z9.s, z16.b, z0.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ld1b { 
z16.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" "add x26, x26, #0x10\n" "bgt 7b\n" "8:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9]\n" + "sdot z8.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z9.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[0]\n" + "sdot z11.s, z16.b, z0.b[0]\n" "addvl x9, x9, #4\n" "ble 9f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[1]\n" + "sdot z9.s, z16.b, z0.b[1]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z10.s, z17.b, z0.b[1]\n" + "sdot z11.s, z16.b, z0.b[1]\n" "addvl x9, x9, #4\n" "ble 9f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[2]\n" + "sdot z9.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" "addvl x9, x9, #4\n" "ble 9f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" "addvl x9, x9, #4\n" "9:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 4b\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { 
z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" + "ld1w { z17.s }, p2/Z, [x14]\n" + "ld1w { z16.s }, p2/Z, [x14, #1, MUL VL]\n" + "add z8.s, z8.s, z17.s\n" + "add z9.s, z9.s, z16.s\n" + "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" + "add z10.s, z10.s, z17.s\n" + "add z11.s, z11.s, z16.s\n" "addvl x14, x14, #4\n" "tbz %x[flags], #4, 10f\n" "ld1w { z0.s }, p2/Z, [x12]\n" @@ -239,10 +239,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 11f\n" "10:" // Height 1: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -255,44 +255,44 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" "tbz %x[flags], #5, 12f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" + "and z19.d, z8.d, z0.d\n" + "and z18.d, z9.d, z1.d\n" + "and z17.d, z10.d, z2.d\n" + "and z16.d, z11.d, z3.d\n" + "asr z19.s, z19.s, #0x1f\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z8.s, z8.s, z19.s\n" + "sqadd z9.s, z9.s, z18.s\n" + "sqadd z10.s, z10.s, z17.s\n" + "sqadd z11.s, z11.s, z16.s\n" "12:" // Height 1: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z8.s, z8.s, z4.s\n" + "add z8.s, z8.s, z16.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z16.s\n" + "add z10.s, z10.s, z16.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z11.s, z11.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" + "add z11.s, z11.s, z16.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z17.s\n" + "smin z9.s, p2/M, z9.s, z17.s\n" + "smin z10.s, p2/M, z10.s, z17.s\n" + "smin z11.s, p2/M, z11.s, z17.s\n" + "smax z8.s, p2/M, z8.s, z16.s\n" + "smax z9.s, p2/M, z9.s, z16.s\n" + "smax z10.s, p2/M, z10.s, z16.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" + "smax z11.s, p2/M, z11.s, z16.s\n" + "uzp1 z16.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z16.b\n" "st1b { z8.b }, p1, [x11]\n" "addvl x11, x11, #1\n" "13:" // Height 1: Writeback done @@ -323,12 +323,12 
@@ void sve_hybrid_s8qs_dot_6x4VL ( "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 18f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -336,150 +336,150 @@ void sve_hybrid_s8qs_dot_6x4VL ( "b 19f\n" "18:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "19:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 21f\n" "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[0]\n" + "sdot z12.s, z17.b, z0.b[0]\n" + "sdot z9.s, z16.b, z1.b[0]\n" + "sdot z13.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[0]\n" + "sdot z14.s, z17.b, z0.b[0]\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" "cmp x27, #0x10\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[0]\n" + "sdot z15.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" "add x26, x26, #0x10\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[1]\n" + "sdot z12.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" "add x25, x25, #0x10\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[1]\n" + "sdot z13.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #-1, 
MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z10.s, z17.b, z1.b[1]\n" + "sdot z14.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[1]\n" + "sdot z15.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[2]\n" + "sdot z12.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[2]\n" + "sdot z13.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[2]\n" + "sdot z14.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[2]\n" + "sdot z15.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[3]\n" + "sdot z12.s, z17.b, z0.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[3]\n" + "sdot z13.s, z16.b, z0.b[3]\n" + "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[3]\n" + "sdot z14.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z1.b[3]\n" + "sdot z15.s, z16.b, z0.b[3]\n" "bgt 20b\n" "21:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[0]\n" + "sdot z12.s, z17.b, z1.b[0]\n" + "sdot z9.s, z16.b, z0.b[0]\n" + "sdot z13.s, z16.b, z1.b[0]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[0]\n" + "sdot z14.s, z17.b, z1.b[0]\n" "addvl x9, x9, #4\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z11.s, z16.b, z0.b[0]\n" + "sdot z15.s, z16.b, z1.b[0]\n" "ble 22f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[1]\n" + "sdot z12.s, z17.b, z1.b[1]\n" + "sdot z9.s, z16.b, z0.b[1]\n" + "sdot z13.s, z16.b, z1.b[1]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z10.s, z17.b, z0.b[1]\n" + "sdot z14.s, z17.b, z1.b[1]\n" "addvl x9, x9, #4\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z11.s, z16.b, z0.b[1]\n" + "sdot z15.s, z16.b, z1.b[1]\n" "ble 22f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[2]\n" + "sdot z12.s, 
z17.b, z1.b[2]\n" + "sdot z9.s, z16.b, z0.b[2]\n" + "sdot z13.s, z16.b, z1.b[2]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z14.s, z17.b, z1.b[2]\n" "addvl x9, x9, #4\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" + "sdot z15.s, z16.b, z1.b[2]\n" "ble 22f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z12.s, z17.b, z1.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "sdot z13.s, z16.b, z1.b[3]\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z14.s, z17.b, z1.b[3]\n" "addvl x9, x9, #4\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" + "sdot z15.s, z16.b, z1.b[3]\n" "22:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 17b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "add x24, x11, x20\n" - "add z8.s, z8.s, z0.s\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add z11.s, z11.s, z3.s\n" - "add z12.s, z12.s, z0.s\n" + "ld1w { z19.s }, p2/Z, [x14]\n" + "add x26, x11, x20\n" + "add z8.s, z8.s, z19.s\n" + "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" + "add z9.s, z9.s, z18.s\n" + "add z10.s, z10.s, z17.s\n" + "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" + "add z11.s, z11.s, z16.s\n" + "add z12.s, z12.s, z19.s\n" "addvl x14, x14, #4\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z2.s\n" - "add z15.s, z15.s, z3.s\n" + "add z13.s, z13.s, z18.s\n" + "add z14.s, z14.s, z17.s\n" + "add z15.s, z15.s, z16.s\n" "tbz %x[flags], #4, 23f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -493,10 +493,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 24f\n" "23:" // Height 2: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -513,77 +513,77 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" "tbz %x[flags], #5, 25f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, 
z7.s\n" - "and z4.d, z12.d, z0.d\n" - "and z5.d, z13.d, z1.d\n" - "and z6.d, z14.d, z2.d\n" - "and z7.d, z15.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z12.s, z12.s, z4.s\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z6.s\n" - "sqadd z15.s, z15.s, z7.s\n" + "and z19.d, z8.d, z0.d\n" + "and z18.d, z9.d, z1.d\n" + "and z17.d, z10.d, z2.d\n" + "and z16.d, z11.d, z3.d\n" + "asr z19.s, z19.s, #0x1f\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z8.s, z8.s, z19.s\n" + "sqadd z9.s, z9.s, z18.s\n" + "sqadd z10.s, z10.s, z17.s\n" + "sqadd z11.s, z11.s, z16.s\n" + "and z19.d, z12.d, z0.d\n" + "and z18.d, z13.d, z1.d\n" + "and z17.d, z14.d, z2.d\n" + "and z16.d, z15.d, z3.d\n" + "asr z19.s, z19.s, #0x1f\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z12.s, z12.s, z19.s\n" + "sqadd z13.s, z13.s, z18.s\n" + "sqadd z14.s, z14.s, z17.s\n" + "sqadd z15.s, z15.s, z16.s\n" "25:" // Height 2: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z8.s, z8.s, z4.s\n" + "add z8.s, z8.s, z17.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z17.s\n" + "add z10.s, z10.s, z17.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z12.s, z12.s, z4.s\n" + "add z11.s, z11.s, z17.s\n" + "add z12.s, z12.s, z17.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z13.s, z13.s, z4.s\n" - "add z14.s, z14.s, z4.s\n" + "add z13.s, z13.s, z17.s\n" + "add z14.s, z14.s, z17.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z15.s, z15.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "add z15.s, z15.s, z17.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z16.s\n" + "smin z9.s, p2/M, z9.s, z16.s\n" + "smin z10.s, p2/M, z10.s, z16.s\n" + "smin z11.s, p2/M, z11.s, z16.s\n" + "smin z12.s, p2/M, z12.s, z16.s\n" + "smin z13.s, p2/M, z13.s, z16.s\n" + "smin z14.s, p2/M, z14.s, z16.s\n" + "smin z15.s, p2/M, z15.s, z16.s\n" + "smax z8.s, p2/M, z8.s, z17.s\n" + "smax z9.s, p2/M, z9.s, z17.s\n" + "smax z10.s, p2/M, z10.s, z17.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z13.s, p2/M, z13.s, z5.s\n" - "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z17.s\n" + "smax z12.s, p2/M, z12.s, z17.s\n" + "uzp1 z16.h, z10.h, z11.h\n" + "uzp1 
z8.b, z8.b, z16.b\n" + "smax z13.s, p2/M, z13.s, z17.s\n" + "smax z14.s, p2/M, z14.s, z17.s\n" "uzp1 z12.h, z12.h, z13.h\n" "st1b { z8.b }, p1, [x11]\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z12.b, z12.b, z13.b\n" - "st1b { z12.b }, p1, [x24]\n" + "smax z15.s, p2/M, z15.s, z17.s\n" + "uzp1 z16.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z16.b\n" + "st1b { z12.b }, p1, [x26]\n" "addvl x11, x11, #1\n" "26:" // Height 2: Writeback done "decw x10, ALL, MUL #4\n" @@ -617,13 +617,13 @@ void sve_hybrid_s8qs_dot_6x4VL ( "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -632,86 +632,86 @@ void sve_hybrid_s8qs_dot_6x4VL ( "b 32f\n" "31:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "32:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 34f\n" "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x24]\n" + "ld1b { z21.b }, p2/Z, [x9]\n" + "sdot z8.s, z21.b, z2.b[0]\n" + "sdot z12.s, z21.b, z1.b[0]\n" + "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z16.s, z21.b, z0.b[0]\n" + "sdot z9.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[0]\n" + "sdot z17.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n" "cmp x27, #0x10\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z10.s, z21.b, z2.b[0]\n" + "sdot z14.s, z21.b, z1.b[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n" + "sdot z18.s, z21.b, z0.b[0]\n" + "sdot z11.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p2/Z, [x9, #4, MUL VL]\n" "add x24, x24, #0x10\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[0]\n" + "sdot z19.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x9, #5, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[1]\n" + "sdot z12.s, z21.b, z1.b[1]\n" + "sdot z16.s, z21.b, z0.b[1]\n" + 
"sdot z9.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[1]\n" + "sdot z17.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z10.s, z21.b, z2.b[1]\n" + "sdot z14.s, z21.b, z1.b[1]\n" + "sdot z18.s, z21.b, z0.b[1]\n" + "sdot z11.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p2/Z, [x9, #-8, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[1]\n" + "sdot z19.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[2]\n" + "sdot z12.s, z21.b, z1.b[2]\n" + "sdot z16.s, z21.b, z0.b[2]\n" + "sdot z9.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p2/Z, [x9, #-6, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[2]\n" + "sdot z17.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z21.b, z2.b[2]\n" + "sdot z14.s, z21.b, z1.b[2]\n" + "sdot z18.s, z21.b, z0.b[2]\n" + "sdot z11.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[2]\n" + "sdot z19.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[3]\n" + "sdot z12.s, z21.b, z1.b[3]\n" + "sdot z16.s, z21.b, z0.b[3]\n" + "sdot z9.s, z20.b, z2.b[3]\n" + "ld1b { z21.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[3]\n" + "sdot z17.s, z20.b, z0.b[3]\n" + "ld1b { z20.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z10.s, z21.b, z2.b[3]\n" + "sdot z14.s, z21.b, z1.b[3]\n" + "sdot z18.s, z21.b, z0.b[3]\n" + "sdot z11.s, z20.b, z2.b[3]\n" + "sdot z15.s, z20.b, z1.b[3]\n" + "sdot z19.s, z20.b, z0.b[3]\n" "bgt 33b\n" "34:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -719,104 +719,104 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x9]\n" + "sdot z8.s, z21.b, z0.b[0]\n" + "sdot z12.s, 
z21.b, z1.b[0]\n" + "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z16.s, z21.b, z2.b[0]\n" + "sdot z9.s, z20.b, z0.b[0]\n" + "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[0]\n" + "sdot z17.s, z20.b, z2.b[0]\n" + "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z10.s, z21.b, z0.b[0]\n" + "sdot z14.s, z21.b, z1.b[0]\n" + "sdot z18.s, z21.b, z2.b[0]\n" + "sdot z11.s, z20.b, z0.b[0]\n" + "sdot z15.s, z20.b, z1.b[0]\n" + "sdot z19.s, z20.b, z2.b[0]\n" "ble 35f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x9]\n" + "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[1]\n" + "sdot z12.s, z21.b, z1.b[1]\n" + "sdot z16.s, z21.b, z2.b[1]\n" + "sdot z9.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[1]\n" + "sdot z17.s, z20.b, z2.b[1]\n" + "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z10.s, z21.b, z0.b[1]\n" + "sdot z14.s, z21.b, z1.b[1]\n" + "sdot z18.s, z21.b, z2.b[1]\n" + "sdot z11.s, z20.b, z0.b[1]\n" + "sdot z15.s, z20.b, z1.b[1]\n" + "sdot z19.s, z20.b, z2.b[1]\n" "ble 35f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x9]\n" + "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[2]\n" + "sdot z12.s, z21.b, z1.b[2]\n" + "sdot z16.s, z21.b, z2.b[2]\n" + "sdot z9.s, z20.b, z0.b[2]\n" + "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[2]\n" + "sdot z17.s, z20.b, z2.b[2]\n" + "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z10.s, z21.b, z0.b[2]\n" + "sdot z14.s, z21.b, z1.b[2]\n" + "sdot z18.s, z21.b, z2.b[2]\n" + "sdot z11.s, z20.b, z0.b[2]\n" + "sdot z15.s, z20.b, z1.b[2]\n" + "sdot z19.s, z20.b, z2.b[2]\n" "ble 35f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x9]\n" + "ld1b { z20.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[3]\n" + "sdot z12.s, z21.b, z1.b[3]\n" + "sdot z16.s, 
z21.b, z2.b[3]\n" + "sdot z9.s, z20.b, z0.b[3]\n" + "ld1b { z21.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[3]\n" + "sdot z17.s, z20.b, z2.b[3]\n" + "ld1b { z20.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z10.s, z21.b, z0.b[3]\n" + "sdot z14.s, z21.b, z1.b[3]\n" + "sdot z18.s, z21.b, z2.b[3]\n" + "sdot z11.s, z20.b, z0.b[3]\n" + "sdot z15.s, z20.b, z1.b[3]\n" + "sdot z19.s, z20.b, z2.b[3]\n" "35:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 30b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" + "ld1w { z23.s }, p2/Z, [x14]\n" + "add x26, x11, x20\n" + "add x25, x26, x20\n" + "ld1w { z22.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x14, #2, MUL VL]\n" + "add z8.s, z8.s, z23.s\n" + "add z9.s, z9.s, z22.s\n" + "ld1w { z20.s }, p2/Z, [x14, #3, MUL VL]\n" + "add z10.s, z10.s, z21.s\n" + "add z11.s, z11.s, z20.s\n" "addvl x14, x14, #4\n" - "add z12.s, z12.s, z0.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z2.s\n" - "add z15.s, z15.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "add z12.s, z12.s, z23.s\n" + "add z13.s, z13.s, z22.s\n" + "add z14.s, z14.s, z21.s\n" + "add z15.s, z15.s, z20.s\n" + "add z16.s, z16.s, z23.s\n" + "add z17.s, z17.s, z22.s\n" + "add z18.s, z18.s, z21.s\n" + "add z19.s, z19.s, z20.s\n" "tbz %x[flags], #4, 36f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -830,10 +830,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 37f\n" "36:" // Height 3: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -854,109 +854,109 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" "tbz %x[flags], #5, 38f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z12.d, z0.d\n" - "and z5.d, z13.d, z1.d\n" - "and z6.d, z14.d, z2.d\n" - "and z7.d, z15.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z12.s, z12.s, z4.s\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z6.s\n" - "sqadd z15.s, z15.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, 
z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z23.d, z8.d, z0.d\n" + "and z22.d, z9.d, z1.d\n" + "and z21.d, z10.d, z2.d\n" + "and z20.d, z11.d, z3.d\n" + "asr z23.s, z23.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z8.s, z8.s, z23.s\n" + "sqadd z9.s, z9.s, z22.s\n" + "sqadd z10.s, z10.s, z21.s\n" + "sqadd z11.s, z11.s, z20.s\n" + "and z23.d, z12.d, z0.d\n" + "and z22.d, z13.d, z1.d\n" + "and z21.d, z14.d, z2.d\n" + "and z20.d, z15.d, z3.d\n" + "asr z23.s, z23.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z12.s, z12.s, z23.s\n" + "sqadd z13.s, z13.s, z22.s\n" + "sqadd z14.s, z14.s, z21.s\n" + "sqadd z15.s, z15.s, z20.s\n" + "and z23.d, z16.d, z0.d\n" + "and z22.d, z17.d, z1.d\n" + "and z21.d, z18.d, z2.d\n" + "and z20.d, z19.d, z3.d\n" + "asr z23.s, z23.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z16.s, z16.s, z23.s\n" + "sqadd z17.s, z17.s, z22.s\n" + "sqadd z18.s, z18.s, z21.s\n" + "sqadd z19.s, z19.s, z20.s\n" "38:" // Height 3: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z8.s, z8.s, z4.s\n" + "add z8.s, z8.s, z21.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z21.s\n" + "add z10.s, z10.s, z21.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z12.s, z12.s, z4.s\n" + "add z11.s, z11.s, z21.s\n" + "add z12.s, z12.s, z21.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z13.s, z13.s, z4.s\n" - "add z14.s, z14.s, z4.s\n" + "add z13.s, z13.s, z21.s\n" + "add z14.s, z14.s, z21.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z15.s, z15.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z15.s, z15.s, z21.s\n" + "add z16.s, z16.s, z21.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z21.s\n" + "add z18.s, z18.s, z21.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z19.s, z19.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z20.s }, 
p2/Z, [x20]\n" + "add z19.s, z19.s, z21.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z20.s\n" + "smin z9.s, p2/M, z9.s, z20.s\n" + "smin z10.s, p2/M, z10.s, z20.s\n" + "smin z11.s, p2/M, z11.s, z20.s\n" + "smin z12.s, p2/M, z12.s, z20.s\n" + "smin z13.s, p2/M, z13.s, z20.s\n" + "smin z14.s, p2/M, z14.s, z20.s\n" + "smin z15.s, p2/M, z15.s, z20.s\n" + "smin z16.s, p2/M, z16.s, z20.s\n" + "smin z17.s, p2/M, z17.s, z20.s\n" + "smin z18.s, p2/M, z18.s, z20.s\n" + "smin z19.s, p2/M, z19.s, z20.s\n" + "smax z8.s, p2/M, z8.s, z21.s\n" + "smax z9.s, p2/M, z9.s, z21.s\n" + "smax z10.s, p2/M, z10.s, z21.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z13.s, p2/M, z13.s, z5.s\n" - "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z21.s\n" + "smax z12.s, p2/M, z12.s, z21.s\n" + "uzp1 z20.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z20.b\n" + "smax z13.s, p2/M, z13.s, z21.s\n" + "smax z14.s, p2/M, z14.s, z21.s\n" "uzp1 z12.h, z12.h, z13.h\n" "st1b { z8.b }, p1, [x11]\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z12.b, z12.b, z13.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z21.s\n" + "smax z16.s, p2/M, z16.s, z21.s\n" + "uzp1 z20.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z20.b\n" + "smax z17.s, p2/M, z17.s, z21.s\n" + "smax z18.s, p2/M, z18.s, z21.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z12.b }, p1, [x24]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "st1b { z12.b }, p1, [x26]\n" + "smax z19.s, p2/M, z19.s, z21.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" + "st1b { z16.b }, p1, [x25]\n" "addvl x11, x11, #1\n" "39:" // Height 3: Writeback done "decw x10, ALL, MUL #4\n" @@ -994,14 +994,14 @@ void sve_hybrid_s8qs_dot_6x4VL ( "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 44f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1011,105 +1011,105 @@ void sve_hybrid_s8qs_dot_6x4VL ( "b 45f\n" "44:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "45:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 47f\n" "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z3.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, 
[x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[0]\n" + "sdot z12.s, z25.b, z2.b[0]\n" + "sdot z16.s, z25.b, z1.b[0]\n" + "sdot z20.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" "add x25, x25, #0x10\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z9.s, z24.b, z3.b[0]\n" + "sdot z13.s, z24.b, z2.b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n" + "sdot z17.s, z24.b, z1.b[0]\n" + "sdot z21.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[0]\n" + "sdot z14.s, z25.b, z2.b[0]\n" + "sdot z18.s, z25.b, z1.b[0]\n" + "sdot z22.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[0]\n" + "sdot z15.s, z24.b, z2.b[0]\n" + "sdot z19.s, z24.b, z1.b[0]\n" + "sdot z23.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[1]\n" + "sdot z12.s, z25.b, z2.b[1]\n" + "sdot z16.s, z25.b, z1.b[1]\n" + "sdot z20.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[1]\n" + "sdot z13.s, z24.b, z2.b[1]\n" + "sdot z17.s, z24.b, z1.b[1]\n" + "sdot z21.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - 
"sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z10.s, z25.b, z3.b[1]\n" + "sdot z14.s, z25.b, z2.b[1]\n" + "sdot z18.s, z25.b, z1.b[1]\n" + "sdot z22.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[1]\n" + "sdot z15.s, z24.b, z2.b[1]\n" + "sdot z19.s, z24.b, z1.b[1]\n" + "sdot z23.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[2]\n" + "sdot z12.s, z25.b, z2.b[2]\n" + "sdot z16.s, z25.b, z1.b[2]\n" + "sdot z20.s, z25.b, z0.b[2]\n" + "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[2]\n" + "sdot z13.s, z24.b, z2.b[2]\n" + "sdot z17.s, z24.b, z1.b[2]\n" + "sdot z21.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[2]\n" + "sdot z14.s, z25.b, z2.b[2]\n" + "sdot z18.s, z25.b, z1.b[2]\n" + "sdot z22.s, z25.b, z0.b[2]\n" + "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[2]\n" + "sdot z15.s, z24.b, z2.b[2]\n" + "sdot z19.s, z24.b, z1.b[2]\n" + "sdot z23.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[3]\n" + "sdot z12.s, z25.b, z2.b[3]\n" + "sdot z16.s, z25.b, z1.b[3]\n" + "sdot z20.s, z25.b, z0.b[3]\n" + "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[3]\n" + "sdot z13.s, z24.b, z2.b[3]\n" + "sdot z17.s, z24.b, z1.b[3]\n" + "sdot z21.s, z24.b, z0.b[3]\n" + "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[3]\n" + "sdot z14.s, z25.b, z2.b[3]\n" + "sdot z18.s, z25.b, z1.b[3]\n" + "sdot z22.s, z25.b, z0.b[3]\n" + "sdot z11.s, z24.b, z3.b[3]\n" + "sdot z15.s, z24.b, z2.b[3]\n" + "sdot z19.s, z24.b, z1.b[3]\n" + "sdot z23.s, z24.b, z0.b[3]\n" "bgt 46b\n" "47:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1118,125 +1118,125 @@ void sve_hybrid_s8qs_dot_6x4VL ( "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[0]\n" + "sdot z12.s, z25.b, z1.b[0]\n" + "sdot z16.s, z25.b, z2.b[0]\n" + "sdot z20.s, z25.b, z3.b[0]\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[0]\n" + "sdot z13.s, z24.b, z1.b[0]\n" + "sdot z17.s, z24.b, z2.b[0]\n" + "sdot z21.s, z24.b, z3.b[0]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z10.s, z25.b, z0.b[0]\n" + "sdot z14.s, z25.b, z1.b[0]\n" + "sdot z18.s, z25.b, z2.b[0]\n" + "sdot z22.s, z25.b, z3.b[0]\n" + 
"sdot z11.s, z24.b, z0.b[0]\n" + "sdot z15.s, z24.b, z1.b[0]\n" + "sdot z19.s, z24.b, z2.b[0]\n" + "sdot z23.s, z24.b, z3.b[0]\n" "ble 48f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[1]\n" + "sdot z12.s, z25.b, z1.b[1]\n" + "sdot z16.s, z25.b, z2.b[1]\n" + "sdot z20.s, z25.b, z3.b[1]\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[1]\n" + "sdot z13.s, z24.b, z1.b[1]\n" + "sdot z17.s, z24.b, z2.b[1]\n" + "sdot z21.s, z24.b, z3.b[1]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z10.s, z25.b, z0.b[1]\n" + "sdot z14.s, z25.b, z1.b[1]\n" + "sdot z18.s, z25.b, z2.b[1]\n" + "sdot z22.s, z25.b, z3.b[1]\n" + "sdot z11.s, z24.b, z0.b[1]\n" + "sdot z15.s, z24.b, z1.b[1]\n" + "sdot z19.s, z24.b, z2.b[1]\n" + "sdot z23.s, z24.b, z3.b[1]\n" "ble 48f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[2]\n" + "sdot z12.s, z25.b, z1.b[2]\n" + "sdot z16.s, z25.b, z2.b[2]\n" + "sdot z20.s, z25.b, z3.b[2]\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[2]\n" + "sdot z13.s, z24.b, z1.b[2]\n" + "sdot z17.s, z24.b, z2.b[2]\n" + "sdot z21.s, z24.b, z3.b[2]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z10.s, z25.b, z0.b[2]\n" + "sdot z14.s, z25.b, z1.b[2]\n" + "sdot z18.s, z25.b, z2.b[2]\n" + "sdot z22.s, z25.b, z3.b[2]\n" + "sdot z11.s, z24.b, z0.b[2]\n" + "sdot z15.s, z24.b, z1.b[2]\n" + "sdot z19.s, z24.b, z2.b[2]\n" + "sdot z23.s, z24.b, z3.b[2]\n" "ble 48f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[3]\n" + "sdot z12.s, z25.b, z1.b[3]\n" + 
"sdot z16.s, z25.b, z2.b[3]\n" + "sdot z20.s, z25.b, z3.b[3]\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[3]\n" + "sdot z13.s, z24.b, z1.b[3]\n" + "sdot z17.s, z24.b, z2.b[3]\n" + "sdot z21.s, z24.b, z3.b[3]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z10.s, z25.b, z0.b[3]\n" + "sdot z14.s, z25.b, z1.b[3]\n" + "sdot z18.s, z25.b, z2.b[3]\n" + "sdot z22.s, z25.b, z3.b[3]\n" + "sdot z11.s, z24.b, z0.b[3]\n" + "sdot z15.s, z24.b, z1.b[3]\n" + "sdot z19.s, z24.b, z2.b[3]\n" + "sdot z23.s, z24.b, z3.b[3]\n" "48:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 43b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "add x24, x11, x20\n" - "add x23, x24, x20\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "add x22, x23, x20\n" - "add z8.s, z8.s, z0.s\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" + "ld1w { z27.s }, p2/Z, [x14]\n" + "add x26, x11, x20\n" + "add x25, x26, x20\n" + "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" + "add x24, x25, x20\n" + "add z8.s, z8.s, z27.s\n" + "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" + "add z9.s, z9.s, z26.s\n" + "add z10.s, z10.s, z25.s\n" "addvl x14, x14, #4\n" - "add z11.s, z11.s, z3.s\n" - "add z12.s, z12.s, z0.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z2.s\n" - "add z15.s, z15.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" + "add z11.s, z11.s, z24.s\n" + "add z12.s, z12.s, z27.s\n" + "add z13.s, z13.s, z26.s\n" + "add z14.s, z14.s, z25.s\n" + "add z15.s, z15.s, z24.s\n" + "add z16.s, z16.s, z27.s\n" + "add z17.s, z17.s, z26.s\n" + "add z18.s, z18.s, z25.s\n" + "add z19.s, z19.s, z24.s\n" + "add z20.s, z20.s, z27.s\n" + "add z21.s, z21.s, z26.s\n" + "add z22.s, z22.s, z25.s\n" + "add z23.s, z23.s, z24.s\n" "tbz %x[flags], #4, 49f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -1250,10 +1250,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 50f\n" "49:" // Height 4: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -1278,141 +1278,141 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" "tbz %x[flags], #5, 51f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, 
z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z12.d, z0.d\n" - "and z5.d, z13.d, z1.d\n" - "and z6.d, z14.d, z2.d\n" - "and z7.d, z15.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z12.s, z12.s, z4.s\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z6.s\n" - "sqadd z15.s, z15.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "and z4.d, z20.d, z0.d\n" - "and z5.d, z21.d, z1.d\n" - "and z6.d, z22.d, z2.d\n" - "and z7.d, z23.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z20.s, z20.s, z4.s\n" - "sqadd z21.s, z21.s, z5.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "sqadd z23.s, z23.s, z7.s\n" + "and z27.d, z8.d, z0.d\n" + "and z26.d, z9.d, z1.d\n" + "and z25.d, z10.d, z2.d\n" + "and z24.d, z11.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z8.s, z8.s, z27.s\n" + "sqadd z9.s, z9.s, z26.s\n" + "sqadd z10.s, z10.s, z25.s\n" + "sqadd z11.s, z11.s, z24.s\n" + "and z27.d, z12.d, z0.d\n" + "and z26.d, z13.d, z1.d\n" + "and z25.d, z14.d, z2.d\n" + "and z24.d, z15.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z12.s, z12.s, z27.s\n" + "sqadd z13.s, z13.s, z26.s\n" + "sqadd z14.s, z14.s, z25.s\n" + "sqadd z15.s, z15.s, z24.s\n" + "and z27.d, z16.d, z0.d\n" + "and z26.d, z17.d, z1.d\n" + "and z25.d, z18.d, z2.d\n" + "and z24.d, z19.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z16.s, z16.s, z27.s\n" + "sqadd z17.s, z17.s, z26.s\n" + "sqadd z18.s, z18.s, z25.s\n" + "sqadd z19.s, z19.s, z24.s\n" + "and z27.d, z20.d, z0.d\n" + "and z26.d, z21.d, z1.d\n" + "and z25.d, z22.d, z2.d\n" + "and z24.d, z23.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z20.s, z20.s, z27.s\n" + "sqadd z21.s, z21.s, z26.s\n" + "sqadd z22.s, z22.s, z25.s\n" + "sqadd z23.s, z23.s, z24.s\n" "51:" // Height 4: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z8.s, z8.s, z4.s\n" + "add z8.s, z8.s, z25.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z25.s\n" + "add z10.s, z10.s, z25.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z12.s, z12.s, z4.s\n" + "add z11.s, z11.s, z25.s\n" + "add z12.s, z12.s, z25.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z13.s, z13.s, z4.s\n" - "add z14.s, z14.s, z4.s\n" + "add z13.s, z13.s, z25.s\n" + "add z14.s, z14.s, z25.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, 
z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z15.s, z15.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z15.s, z15.s, z25.s\n" + "add z16.s, z16.s, z25.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z25.s\n" + "add z18.s, z18.s, z25.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z25.s\n" + "add z20.s, z20.s, z25.s\n" ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z25.s\n" + "add z22.s, z22.s, z25.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z23.s, z23.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "add z23.s, z23.s, z25.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z24.s\n" + "smin z9.s, p2/M, z9.s, z24.s\n" + "smin z10.s, p2/M, z10.s, z24.s\n" + "smin z11.s, p2/M, z11.s, z24.s\n" + "smin z12.s, p2/M, z12.s, z24.s\n" + "smin z13.s, p2/M, z13.s, z24.s\n" + "smin z14.s, p2/M, z14.s, z24.s\n" + "smin z15.s, p2/M, z15.s, z24.s\n" + "smin z16.s, p2/M, z16.s, z24.s\n" + "smin z17.s, p2/M, z17.s, z24.s\n" + "smin z18.s, p2/M, z18.s, z24.s\n" + "smin z19.s, p2/M, z19.s, z24.s\n" + "smin z20.s, p2/M, z20.s, z24.s\n" + "smin z21.s, p2/M, z21.s, z24.s\n" + "smin z22.s, p2/M, z22.s, z24.s\n" + "smin z23.s, p2/M, z23.s, z24.s\n" + "smax z8.s, p2/M, z8.s, z25.s\n" + "smax z9.s, p2/M, z9.s, z25.s\n" + "smax z10.s, p2/M, z10.s, z25.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z13.s, p2/M, z13.s, z5.s\n" - "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z25.s\n" + "smax z12.s, p2/M, z12.s, z25.s\n" + "uzp1 z24.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z24.b\n" + "smax z13.s, p2/M, z13.s, z25.s\n" + "smax z14.s, p2/M, z14.s, z25.s\n" "uzp1 z12.h, z12.h, z13.h\n" "st1b { z8.b }, p1, [x11]\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z12.b, z12.b, z13.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z25.s\n" + "smax z16.s, p2/M, z16.s, z25.s\n" + "uzp1 z24.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z24.b\n" + "smax z17.s, p2/M, z17.s, z25.s\n" + "smax z18.s, p2/M, z18.s, z25.s\n" "uzp1 z16.h, z16.h, 
z17.h\n" - "st1b { z12.b }, p1, [x24]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "st1b { z12.b }, p1, [x26]\n" + "smax z19.s, p2/M, z19.s, z25.s\n" + "smax z20.s, p2/M, z20.s, z25.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z25.s\n" + "smax z22.s, p2/M, z22.s, z25.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "st1b { z16.b }, p1, [x23]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "st1b { z20.b }, p1, [x22]\n" + "st1b { z16.b }, p1, [x25]\n" + "smax z23.s, p2/M, z23.s, z25.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "st1b { z20.b }, p1, [x24]\n" "addvl x11, x11, #1\n" "52:" // Height 4: Writeback done "decw x10, ALL, MUL #4\n" @@ -1454,15 +1454,15 @@ void sve_hybrid_s8qs_dot_6x4VL ( "56:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 57f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1473,124 +1473,124 @@ void sve_hybrid_s8qs_dot_6x4VL ( "b 58f\n" "57:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "58:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 60f\n" "59:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x26]\n" + "ld1rqb { z3.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" + "ld1rqb { z0.b }, p0/Z, [x22]\n" + "ld1b { z29.b }, p2/Z, [x9]\n" + "sdot z8.s, z29.b, z4.b[0]\n" + "sdot z12.s, z29.b, z3.b[0]\n" + "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z16.s, z29.b, z2.b[0]\n" + "sdot z20.s, z29.b, z1.b[0]\n" "add x25, x25, #0x10\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z24.s, z29.b, z0.b[0]\n" + "sdot z9.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n" "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z13.s, z28.b, z3.b[0]\n" + "sdot z17.s, z28.b, z2.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - 
"sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n" + "sdot z21.s, z28.b, z1.b[0]\n" + "sdot z25.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[0]\n" + "sdot z14.s, z29.b, z3.b[0]\n" + "sdot z18.s, z29.b, z2.b[0]\n" + "sdot z22.s, z29.b, z1.b[0]\n" + "sdot z26.s, z29.b, z0.b[0]\n" + "sdot z11.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, p2/Z, [x9, #4, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[0]\n" + "sdot z19.s, z28.b, z2.b[0]\n" + "sdot z23.s, z28.b, z1.b[0]\n" + "sdot z27.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x9, #5, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[1]\n" + "sdot z12.s, z29.b, z3.b[1]\n" + "sdot z16.s, z29.b, z2.b[1]\n" + "sdot z20.s, z29.b, z1.b[1]\n" + "sdot z24.s, z29.b, z0.b[1]\n" + "sdot z9.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[1]\n" + "sdot z17.s, z28.b, z2.b[1]\n" + "sdot z21.s, z28.b, z1.b[1]\n" + "sdot z25.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot 
z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z10.s, z29.b, z4.b[1]\n" + "sdot z14.s, z29.b, z3.b[1]\n" + "sdot z18.s, z29.b, z2.b[1]\n" + "sdot z22.s, z29.b, z1.b[1]\n" + "sdot z26.s, z29.b, z0.b[1]\n" + "sdot z11.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, p2/Z, [x9, #-8, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[1]\n" + "sdot z19.s, z28.b, z2.b[1]\n" + "sdot z23.s, z28.b, z1.b[1]\n" + "sdot z27.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[2]\n" + "sdot z12.s, z29.b, z3.b[2]\n" + "sdot z16.s, z29.b, z2.b[2]\n" + "sdot z20.s, z29.b, z1.b[2]\n" + "sdot z24.s, z29.b, z0.b[2]\n" + "sdot z9.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p2/Z, [x9, #-6, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[2]\n" + "sdot z17.s, z28.b, z2.b[2]\n" + "sdot z21.s, z28.b, z1.b[2]\n" + "sdot z25.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[2]\n" + "sdot z14.s, z29.b, z3.b[2]\n" + "sdot z18.s, z29.b, z2.b[2]\n" + "sdot z22.s, z29.b, z1.b[2]\n" + "sdot z26.s, z29.b, z0.b[2]\n" + "sdot z11.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[2]\n" + "sdot z19.s, z28.b, z2.b[2]\n" + "sdot z23.s, z28.b, z1.b[2]\n" + "sdot z27.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[3]\n" + "sdot z12.s, z29.b, z3.b[3]\n" + "sdot z16.s, z29.b, z2.b[3]\n" + "sdot z20.s, z29.b, z1.b[3]\n" + "sdot z24.s, z29.b, z0.b[3]\n" + "sdot z9.s, z28.b, z4.b[3]\n" + "ld1b { z29.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[3]\n" + "sdot z17.s, z28.b, z2.b[3]\n" + "sdot z21.s, z28.b, z1.b[3]\n" + "sdot z25.s, z28.b, z0.b[3]\n" + "ld1b { z28.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[3]\n" + "sdot z14.s, z29.b, z3.b[3]\n" + "sdot z18.s, z29.b, z2.b[3]\n" + "sdot z22.s, z29.b, z1.b[3]\n" + "sdot z26.s, z29.b, z0.b[3]\n" + "sdot z11.s, z28.b, z4.b[3]\n" + "sdot z15.s, z28.b, z3.b[3]\n" + "sdot z19.s, z28.b, z2.b[3]\n" + "sdot z23.s, z28.b, z1.b[3]\n" + "sdot z27.s, z28.b, z0.b[3]\n" "bgt 59b\n" "60:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1600,146 +1600,146 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x9]\n" + "sdot z8.s, z29.b, z0.b[0]\n" + "sdot z12.s, z29.b, z1.b[0]\n" + "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z16.s, z29.b, z2.b[0]\n" + "sdot z20.s, z29.b, z3.b[0]\n" + "sdot z24.s, z29.b, z4.b[0]\n" + "sdot z9.s, z28.b, z0.b[0]\n" + "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[0]\n" + "sdot z17.s, z28.b, z2.b[0]\n" + "sdot z21.s, z28.b, z3.b[0]\n" + "sdot z25.s, z28.b, z4.b[0]\n" + "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot 
z26.s, z6.b, z4.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z10.s, z29.b, z0.b[0]\n" + "sdot z14.s, z29.b, z1.b[0]\n" + "sdot z18.s, z29.b, z2.b[0]\n" + "sdot z22.s, z29.b, z3.b[0]\n" + "sdot z26.s, z29.b, z4.b[0]\n" + "sdot z11.s, z28.b, z0.b[0]\n" + "sdot z15.s, z28.b, z1.b[0]\n" + "sdot z19.s, z28.b, z2.b[0]\n" + "sdot z23.s, z28.b, z3.b[0]\n" + "sdot z27.s, z28.b, z4.b[0]\n" "ble 61f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z29.b }, p2/Z, [x9]\n" + "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[1]\n" + "sdot z12.s, z29.b, z1.b[1]\n" + "sdot z16.s, z29.b, z2.b[1]\n" + "sdot z20.s, z29.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z24.s, z29.b, z4.b[1]\n" + "sdot z9.s, z28.b, z0.b[1]\n" + "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[1]\n" + "sdot z17.s, z28.b, z2.b[1]\n" + "sdot z21.s, z28.b, z3.b[1]\n" + "sdot z25.s, z28.b, z4.b[1]\n" + "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z10.s, z29.b, z0.b[1]\n" + "sdot z14.s, z29.b, z1.b[1]\n" + "sdot z18.s, z29.b, z2.b[1]\n" + "sdot z22.s, z29.b, z3.b[1]\n" + "sdot z26.s, z29.b, z4.b[1]\n" + "sdot z11.s, z28.b, z0.b[1]\n" + "sdot z15.s, z28.b, z1.b[1]\n" + "sdot z19.s, z28.b, z2.b[1]\n" + "sdot z23.s, z28.b, z3.b[1]\n" + "sdot z27.s, z28.b, z4.b[1]\n" "ble 61f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z29.b }, p2/Z, [x9]\n" + "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[2]\n" + "sdot z12.s, z29.b, z1.b[2]\n" + "sdot z16.s, z29.b, z2.b[2]\n" + "sdot z20.s, z29.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z24.s, z29.b, z4.b[2]\n" + "sdot z9.s, z28.b, z0.b[2]\n" + "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[2]\n" + "sdot z17.s, z28.b, z2.b[2]\n" + "sdot z21.s, z28.b, z3.b[2]\n" + "sdot z25.s, z28.b, z4.b[2]\n" + "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" 
+ "sdot z10.s, z29.b, z0.b[2]\n" + "sdot z14.s, z29.b, z1.b[2]\n" + "sdot z18.s, z29.b, z2.b[2]\n" + "sdot z22.s, z29.b, z3.b[2]\n" + "sdot z26.s, z29.b, z4.b[2]\n" + "sdot z11.s, z28.b, z0.b[2]\n" + "sdot z15.s, z28.b, z1.b[2]\n" + "sdot z19.s, z28.b, z2.b[2]\n" + "sdot z23.s, z28.b, z3.b[2]\n" + "sdot z27.s, z28.b, z4.b[2]\n" "ble 61f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x9]\n" + "ld1b { z28.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[3]\n" + "sdot z12.s, z29.b, z1.b[3]\n" + "sdot z16.s, z29.b, z2.b[3]\n" + "sdot z20.s, z29.b, z3.b[3]\n" + "sdot z24.s, z29.b, z4.b[3]\n" + "sdot z9.s, z28.b, z0.b[3]\n" + "ld1b { z29.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[3]\n" + "sdot z17.s, z28.b, z2.b[3]\n" + "sdot z21.s, z28.b, z3.b[3]\n" + "sdot z25.s, z28.b, z4.b[3]\n" + "ld1b { z28.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z10.s, z29.b, z0.b[3]\n" + "sdot z14.s, z29.b, z1.b[3]\n" + "sdot z18.s, z29.b, z2.b[3]\n" + "sdot z22.s, z29.b, z3.b[3]\n" + "sdot z26.s, z29.b, z4.b[3]\n" + "sdot z11.s, z28.b, z0.b[3]\n" + "sdot z15.s, z28.b, z1.b[3]\n" + "sdot z19.s, z28.b, z2.b[3]\n" + "sdot z23.s, z28.b, z3.b[3]\n" + "sdot z27.s, z28.b, z4.b[3]\n" "61:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 56b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" - "ld1w { z0.s }, p2/Z, [x14]\n" + "add x26, x11, x20\n" + "ld1w { z31.s }, p2/Z, [x14]\n" + "add x25, x26, x20\n" + "ld1w { z30.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x14, #2, MUL VL]\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" + "ld1w { z28.s }, p2/Z, [x14, #3, MUL VL]\n" + "add z8.s, z8.s, z31.s\n" + "add z9.s, z9.s, z30.s\n" "addvl x14, x14, #4\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" - "add z12.s, z12.s, z0.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z2.s\n" - "add z15.s, z15.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" + "add z10.s, z10.s, z29.s\n" + "add z11.s, z11.s, z28.s\n" + "add z12.s, z12.s, z31.s\n" + "add z13.s, z13.s, z30.s\n" + "add z14.s, z14.s, z29.s\n" + "add z15.s, z15.s, z28.s\n" + "add z16.s, 
z16.s, z31.s\n" + "add z17.s, z17.s, z30.s\n" + "add z18.s, z18.s, z29.s\n" + "add z19.s, z19.s, z28.s\n" + "add z20.s, z20.s, z31.s\n" + "add z21.s, z21.s, z30.s\n" + "add z22.s, z22.s, z29.s\n" + "add z23.s, z23.s, z28.s\n" + "add z24.s, z24.s, z31.s\n" + "add z25.s, z25.s, z30.s\n" + "add z26.s, z26.s, z29.s\n" + "add z27.s, z27.s, z28.s\n" "tbz %x[flags], #4, 62f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -1753,10 +1753,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 63f\n" "62:" // Height 5: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -1785,173 +1785,173 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" "tbz %x[flags], #5, 64f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z12.d, z0.d\n" - "and z5.d, z13.d, z1.d\n" - "and z6.d, z14.d, z2.d\n" - "and z7.d, z15.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z12.s, z12.s, z4.s\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z6.s\n" - "sqadd z15.s, z15.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "and z4.d, z20.d, z0.d\n" - "and z5.d, z21.d, z1.d\n" - "and z6.d, z22.d, z2.d\n" - "and z7.d, z23.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z20.s, z20.s, z4.s\n" - "sqadd z21.s, z21.s, z5.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "sqadd z23.s, z23.s, z7.s\n" - "and z4.d, z24.d, z0.d\n" - "and z5.d, z25.d, z1.d\n" - "and z6.d, z26.d, z2.d\n" - "and z7.d, z27.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z5.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z7.s\n" + "and z31.d, z8.d, z0.d\n" + "and z30.d, z9.d, z1.d\n" + "and z29.d, z10.d, z2.d\n" + "and z28.d, z11.d, z3.d\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z8.s, z8.s, z31.s\n" + "sqadd z9.s, z9.s, z30.s\n" + "sqadd z10.s, z10.s, z29.s\n" + "sqadd z11.s, z11.s, z28.s\n" + "and z31.d, z12.d, z0.d\n" + "and z30.d, z13.d, z1.d\n" + "and z29.d, z14.d, z2.d\n" + "and z28.d, z15.d, z3.d\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z12.s, z12.s, z31.s\n" + "sqadd z13.s, z13.s, z30.s\n" + "sqadd z14.s, z14.s, z29.s\n" + 
"sqadd z15.s, z15.s, z28.s\n" + "and z31.d, z16.d, z0.d\n" + "and z30.d, z17.d, z1.d\n" + "and z29.d, z18.d, z2.d\n" + "and z28.d, z19.d, z3.d\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z16.s, z16.s, z31.s\n" + "sqadd z17.s, z17.s, z30.s\n" + "sqadd z18.s, z18.s, z29.s\n" + "sqadd z19.s, z19.s, z28.s\n" + "and z31.d, z20.d, z0.d\n" + "and z30.d, z21.d, z1.d\n" + "and z29.d, z22.d, z2.d\n" + "and z28.d, z23.d, z3.d\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z20.s, z20.s, z31.s\n" + "sqadd z21.s, z21.s, z30.s\n" + "sqadd z22.s, z22.s, z29.s\n" + "sqadd z23.s, z23.s, z28.s\n" + "and z31.d, z24.d, z0.d\n" + "and z30.d, z25.d, z1.d\n" + "and z29.d, z26.d, z2.d\n" + "and z28.d, z27.d, z3.d\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z24.s, z24.s, z31.s\n" + "sqadd z25.s, z25.s, z30.s\n" + "sqadd z26.s, z26.s, z29.s\n" + "sqadd z27.s, z27.s, z28.s\n" "64:" // Height 5: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z29.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z8.s, z8.s, z4.s\n" + "add z8.s, z8.s, z29.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z29.s\n" + "add z10.s, z10.s, z29.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z12.s, z12.s, z4.s\n" + "add z11.s, z11.s, z29.s\n" + "add z12.s, z12.s, z29.s\n" ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" - "add z13.s, z13.s, z4.s\n" - "add z14.s, z14.s, z4.s\n" + "add z13.s, z13.s, z29.s\n" + "add z14.s, z14.s, z29.s\n" ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z15.s, z15.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z15.s, z15.s, z29.s\n" + "add z16.s, z16.s, z29.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z29.s\n" + "add z18.s, z18.s, z29.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z29.s\n" + "add z20.s, z20.s, z29.s\n" ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z29.s\n" + "add z22.s, z22.s, z29.s\n" ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z23.s, z23.s, z29.s\n" + "add z24.s, z24.s, z29.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z29.s\n" + "add z26.s, z26.s, z29.s\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { 
z6.s }, p2/Z, [x25]\n" - "add z27.s, z27.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z29.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z29.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z28.s\n" + "smin z9.s, p2/M, z9.s, z28.s\n" + "smin z10.s, p2/M, z10.s, z28.s\n" + "smin z11.s, p2/M, z11.s, z28.s\n" + "smin z12.s, p2/M, z12.s, z28.s\n" + "smin z13.s, p2/M, z13.s, z28.s\n" + "smin z14.s, p2/M, z14.s, z28.s\n" + "smin z15.s, p2/M, z15.s, z28.s\n" + "smin z16.s, p2/M, z16.s, z28.s\n" + "smin z17.s, p2/M, z17.s, z28.s\n" + "smin z18.s, p2/M, z18.s, z28.s\n" + "smin z19.s, p2/M, z19.s, z28.s\n" + "smin z20.s, p2/M, z20.s, z28.s\n" + "smin z21.s, p2/M, z21.s, z28.s\n" + "smin z22.s, p2/M, z22.s, z28.s\n" + "smin z23.s, p2/M, z23.s, z28.s\n" + "smin z24.s, p2/M, z24.s, z28.s\n" + "smin z25.s, p2/M, z25.s, z28.s\n" + "smin z26.s, p2/M, z26.s, z28.s\n" + "smin z27.s, p2/M, z27.s, z28.s\n" + "smax z8.s, p2/M, z8.s, z29.s\n" + "smax z9.s, p2/M, z9.s, z29.s\n" + "smax z10.s, p2/M, z10.s, z29.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z13.s, p2/M, z13.s, z5.s\n" - "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z29.s\n" + "smax z12.s, p2/M, z12.s, z29.s\n" + "uzp1 z28.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z28.b\n" + "smax z13.s, p2/M, z13.s, z29.s\n" + "smax z14.s, p2/M, z14.s, z29.s\n" "uzp1 z12.h, z12.h, z13.h\n" "st1b { z8.b }, p1, [x11]\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z12.b, z12.b, z13.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z29.s\n" + "smax z16.s, p2/M, z16.s, z29.s\n" + "uzp1 z28.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z28.b\n" + "smax z17.s, p2/M, z17.s, z29.s\n" + "smax z18.s, p2/M, z18.s, z29.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z12.b }, p1, [x24]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "st1b { z12.b }, p1, [x26]\n" + "smax z19.s, p2/M, z19.s, z29.s\n" + "smax z20.s, p2/M, z20.s, z29.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z29.s\n" + "smax z22.s, p2/M, z22.s, z29.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "st1b { z16.b }, p1, [x23]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, 
z26.s, z5.s\n" + "st1b { z16.b }, p1, [x25]\n" + "smax z23.s, p2/M, z23.s, z29.s\n" + "smax z24.s, p2/M, z24.s, z29.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z29.s\n" + "smax z26.s, p2/M, z26.s, z29.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z20.b }, p1, [x22]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x21]\n" + "st1b { z20.b }, p1, [x24]\n" + "smax z27.s, p2/M, z27.s, z29.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x23]\n" "addvl x11, x11, #1\n" "65:" // Height 5: Writeback done "decw x10, ALL, MUL #4\n" @@ -2000,16 +2000,16 @@ void sve_hybrid_s8qs_dot_6x4VL ( "69:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 70f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 71f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -2021,143 +2021,143 @@ void sve_hybrid_s8qs_dot_6x4VL ( "b 71f\n" "70:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "71:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 73f\n" "72:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z6.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z1.b }, p2/Z, [x9]\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[0]\n" + "sdot z12.s, z1.b, z6.b[0]\n" + "sdot z16.s, z1.b, z5.b[0]\n" + "sdot z20.s, z1.b, z4.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z24.s, z1.b, z3.b[0]\n" + "sdot z28.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n" "add x21, x21, #0x10\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, 
p2/Z, [x9, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "sdot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #7, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[0]\n" + "sdot z13.s, z0.b, z6.b[0]\n" + "sdot z17.s, z0.b, z5.b[0]\n" + "sdot z21.s, z0.b, z4.b[0]\n" + "sdot z25.s, z0.b, z3.b[0]\n" + "sdot z29.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[0]\n" + "sdot z14.s, z1.b, z6.b[0]\n" + "sdot z18.s, z1.b, z5.b[0]\n" + "sdot z22.s, z1.b, z4.b[0]\n" + "sdot z26.s, z1.b, z3.b[0]\n" + "sdot z30.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[0]\n" + "sdot z15.s, z0.b, z6.b[0]\n" + "sdot z19.s, z0.b, z5.b[0]\n" + "sdot z23.s, z0.b, z4.b[0]\n" + "sdot z27.s, z0.b, z3.b[0]\n" + "sdot z31.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[1]\n" + "sdot z12.s, z1.b, z6.b[1]\n" + "sdot z16.s, z1.b, z5.b[1]\n" + "sdot z20.s, z1.b, z4.b[1]\n" + "sdot z24.s, z1.b, z3.b[1]\n" + "sdot z28.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[1]\n" + "sdot z13.s, z0.b, z6.b[1]\n" + "sdot z17.s, z0.b, z5.b[1]\n" + "sdot z21.s, z0.b, z4.b[1]\n" + "sdot z25.s, z0.b, z3.b[1]\n" + "sdot z29.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "sdot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "sdot z31.s, z7.b, z5.b[2]\n" - "ld1b { 
z7.b }, p2/Z, [x9, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z30.s, z6.b, z5.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" - "sdot z31.s, z7.b, z5.b[3]\n" + "sdot z10.s, z1.b, z7.b[1]\n" + "sdot z14.s, z1.b, z6.b[1]\n" + "sdot z18.s, z1.b, z5.b[1]\n" + "sdot z22.s, z1.b, z4.b[1]\n" + "sdot z26.s, z1.b, z3.b[1]\n" + "sdot z30.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[1]\n" + "sdot z15.s, z0.b, z6.b[1]\n" + "sdot z19.s, z0.b, z5.b[1]\n" + "sdot z23.s, z0.b, z4.b[1]\n" + "sdot z27.s, z0.b, z3.b[1]\n" + "sdot z31.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[2]\n" + "sdot z12.s, z1.b, z6.b[2]\n" + "sdot z16.s, z1.b, z5.b[2]\n" + "sdot z20.s, z1.b, z4.b[2]\n" + "sdot z24.s, z1.b, z3.b[2]\n" + "sdot z28.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[2]\n" + "sdot z13.s, z0.b, z6.b[2]\n" + "sdot z17.s, z0.b, z5.b[2]\n" + "sdot z21.s, z0.b, z4.b[2]\n" + "sdot z25.s, z0.b, z3.b[2]\n" + "sdot z29.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[2]\n" + "sdot z14.s, z1.b, z6.b[2]\n" + "sdot z18.s, z1.b, z5.b[2]\n" + "sdot z22.s, z1.b, z4.b[2]\n" + "sdot z26.s, z1.b, z3.b[2]\n" + "sdot z30.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[2]\n" + "sdot z15.s, z0.b, z6.b[2]\n" + "sdot z19.s, z0.b, z5.b[2]\n" + "sdot z23.s, z0.b, z4.b[2]\n" + "sdot z27.s, z0.b, z3.b[2]\n" + "sdot z31.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[3]\n" + "sdot z12.s, z1.b, z6.b[3]\n" + "sdot z16.s, z1.b, z5.b[3]\n" + "sdot z20.s, z1.b, z4.b[3]\n" + "sdot z24.s, z1.b, z3.b[3]\n" + "sdot z28.s, z1.b, z2.b[3]\n" + "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[3]\n" + "sdot z13.s, z0.b, z6.b[3]\n" + "sdot z17.s, z0.b, z5.b[3]\n" + "sdot z21.s, z0.b, z4.b[3]\n" + "sdot z25.s, z0.b, z3.b[3]\n" + "sdot z29.s, z0.b, z2.b[3]\n" + "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[3]\n" + "sdot z14.s, z1.b, z6.b[3]\n" + "sdot z18.s, z1.b, z5.b[3]\n" + "sdot z22.s, z1.b, z4.b[3]\n" + "sdot z26.s, z1.b, z3.b[3]\n" + "sdot z30.s, z1.b, z2.b[3]\n" + "sdot z11.s, z0.b, z7.b[3]\n" + "sdot z15.s, z0.b, z6.b[3]\n" + "sdot z19.s, z0.b, z5.b[3]\n" + "sdot z23.s, z0.b, z4.b[3]\n" + "sdot z27.s, z0.b, z3.b[3]\n" + "sdot z31.s, z0.b, z2.b[3]\n" "bgt 72b\n" "73:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -2168,167 +2168,167 @@ void sve_hybrid_s8qs_dot_6x4VL ( "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" "ld1rqb { z5.b }, p0/Z, [x21]\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, 
z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[0]\n" + "sdot z12.s, z7.b, z1.b[0]\n" + "sdot z16.s, z7.b, z2.b[0]\n" + "sdot z20.s, z7.b, z3.b[0]\n" + "sdot z24.s, z7.b, z4.b[0]\n" + "sdot z28.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[0]\n" + "sdot z13.s, z6.b, z1.b[0]\n" + "sdot z17.s, z6.b, z2.b[0]\n" + "sdot z21.s, z6.b, z3.b[0]\n" + "sdot z25.s, z6.b, z4.b[0]\n" + "sdot z29.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z30.s, z6.b, z5.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "sdot z31.s, z7.b, z5.b[0]\n" + "sdot z10.s, z7.b, z0.b[0]\n" + "sdot z14.s, z7.b, z1.b[0]\n" + "sdot z18.s, z7.b, z2.b[0]\n" + "sdot z22.s, z7.b, z3.b[0]\n" + "sdot z26.s, z7.b, z4.b[0]\n" + "sdot z30.s, z7.b, z5.b[0]\n" + "sdot z11.s, z6.b, z0.b[0]\n" + "sdot z15.s, z6.b, z1.b[0]\n" + "sdot z19.s, z6.b, z2.b[0]\n" + "sdot z23.s, z6.b, z3.b[0]\n" + "sdot z27.s, z6.b, z4.b[0]\n" + "sdot z31.s, z6.b, z5.b[0]\n" "ble 74f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[1]\n" + "sdot z12.s, z7.b, z1.b[1]\n" + "sdot z16.s, z7.b, z2.b[1]\n" + "sdot z20.s, z7.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z24.s, z7.b, z4.b[1]\n" + "sdot z28.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[1]\n" + "sdot z13.s, z6.b, z1.b[1]\n" + "sdot z17.s, z6.b, z2.b[1]\n" + "sdot z21.s, z6.b, z3.b[1]\n" + "sdot z25.s, z6.b, z4.b[1]\n" + "sdot z29.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z30.s, z6.b, z5.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "sdot z31.s, z7.b, z5.b[1]\n" + "sdot z10.s, z7.b, z0.b[1]\n" + "sdot z14.s, z7.b, z1.b[1]\n" + "sdot z18.s, z7.b, z2.b[1]\n" + "sdot z22.s, z7.b, z3.b[1]\n" + "sdot z26.s, z7.b, z4.b[1]\n" + "sdot z30.s, z7.b, z5.b[1]\n" + "sdot z11.s, z6.b, z0.b[1]\n" + "sdot z15.s, z6.b, z1.b[1]\n" + "sdot 
z19.s, z6.b, z2.b[1]\n" + "sdot z23.s, z6.b, z3.b[1]\n" + "sdot z27.s, z6.b, z4.b[1]\n" + "sdot z31.s, z6.b, z5.b[1]\n" "ble 74f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[2]\n" + "sdot z12.s, z7.b, z1.b[2]\n" + "sdot z16.s, z7.b, z2.b[2]\n" + "sdot z20.s, z7.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "sdot z24.s, z7.b, z4.b[2]\n" + "sdot z28.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[2]\n" + "sdot z13.s, z6.b, z1.b[2]\n" + "sdot z17.s, z6.b, z2.b[2]\n" + "sdot z21.s, z6.b, z3.b[2]\n" + "sdot z25.s, z6.b, z4.b[2]\n" + "sdot z29.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z30.s, z6.b, z5.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "sdot z31.s, z7.b, z5.b[2]\n" + "sdot z10.s, z7.b, z0.b[2]\n" + "sdot z14.s, z7.b, z1.b[2]\n" + "sdot z18.s, z7.b, z2.b[2]\n" + "sdot z22.s, z7.b, z3.b[2]\n" + "sdot z26.s, z7.b, z4.b[2]\n" + "sdot z30.s, z7.b, z5.b[2]\n" + "sdot z11.s, z6.b, z0.b[2]\n" + "sdot z15.s, z6.b, z1.b[2]\n" + "sdot z19.s, z6.b, z2.b[2]\n" + "sdot z23.s, z6.b, z3.b[2]\n" + "sdot z27.s, z6.b, z4.b[2]\n" + "sdot z31.s, z6.b, z5.b[2]\n" "ble 74f\n" - "ld1b { z6.b }, p2/Z, [x9]\n" - "ld1b { z7.b }, p2/Z, [x9, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p2/Z, [x9, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p2/Z, [x9, #3, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x9]\n" + "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[3]\n" + "sdot z12.s, z7.b, z1.b[3]\n" + "sdot z16.s, z7.b, z2.b[3]\n" + "sdot z20.s, z7.b, z3.b[3]\n" + "sdot z24.s, z7.b, z4.b[3]\n" + "sdot z28.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[3]\n" + "sdot z13.s, z6.b, z1.b[3]\n" + "sdot z17.s, z6.b, z2.b[3]\n" + "sdot z21.s, z6.b, z3.b[3]\n" + "sdot z25.s, z6.b, z4.b[3]\n" + "sdot z29.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z30.s, z6.b, z5.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" - "sdot z31.s, z7.b, z5.b[3]\n" + "sdot 
z10.s, z7.b, z0.b[3]\n" + "sdot z14.s, z7.b, z1.b[3]\n" + "sdot z18.s, z7.b, z2.b[3]\n" + "sdot z22.s, z7.b, z3.b[3]\n" + "sdot z26.s, z7.b, z4.b[3]\n" + "sdot z30.s, z7.b, z5.b[3]\n" + "sdot z11.s, z6.b, z0.b[3]\n" + "sdot z15.s, z6.b, z1.b[3]\n" + "sdot z19.s, z6.b, z2.b[3]\n" + "sdot z23.s, z6.b, z3.b[3]\n" + "sdot z27.s, z6.b, z4.b[3]\n" + "sdot z31.s, z6.b, z5.b[3]\n" "74:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 69b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x11, x20\n" + "add x26, x11, x20\n" + "add x25, x26, x20\n" + "ld1w { z3.s }, p2/Z, [x14]\n" + "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" + "add x24, x25, x20\n" "add x23, x24, x20\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" "add x22, x23, x20\n" - "add x21, x22, x20\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add x20, x21, x20\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" + "add z8.s, z8.s, z3.s\n" + "add z9.s, z9.s, z2.s\n" + "add z10.s, z10.s, z1.s\n" + "add z11.s, z11.s, z0.s\n" "addvl x14, x14, #4\n" - "add z12.s, z12.s, z0.s\n" - "add z13.s, z13.s, z1.s\n" - "add z14.s, z14.s, z2.s\n" - "add z15.s, z15.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z1.s\n" - "add z30.s, z30.s, z2.s\n" - "add z31.s, z31.s, z3.s\n" + "add z12.s, z12.s, z3.s\n" + "add z13.s, z13.s, z2.s\n" + "add z14.s, z14.s, z1.s\n" + "add z15.s, z15.s, z0.s\n" + "add z16.s, z16.s, z3.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z1.s\n" + "add z19.s, z19.s, z0.s\n" + "add z20.s, z20.s, z3.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z1.s\n" + "add z23.s, z23.s, z0.s\n" + "add z24.s, z24.s, z3.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z1.s\n" + "add z27.s, z27.s, z0.s\n" + "add z28.s, z28.s, z3.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z1.s\n" + "add z31.s, z31.s, z0.s\n" "tbz %x[flags], #4, 75f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -2342,10 +2342,10 @@ void sve_hybrid_s8qs_dot_6x4VL ( "addvl x13, x13, #4\n" "b 76f\n" "75:" // Height 6: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -2378,81 +2378,81 @@ void sve_hybrid_s8qs_dot_6x4VL ( ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n" ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" "tbz %x[flags], #5, 77f\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" + "and z7.d, z8.d, z0.d\n" + "and z6.d, z9.d, z1.d\n" + "and z5.d, 
z10.d, z2.d\n" + "and z4.d, z11.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z12.d, z0.d\n" - "and z5.d, z13.d, z1.d\n" - "and z6.d, z14.d, z2.d\n" - "and z7.d, z15.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z12.s, z12.s, z4.s\n" - "sqadd z13.s, z13.s, z5.s\n" - "sqadd z14.s, z14.s, z6.s\n" - "sqadd z15.s, z15.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "and z4.d, z20.d, z0.d\n" - "and z5.d, z21.d, z1.d\n" - "and z6.d, z22.d, z2.d\n" - "and z7.d, z23.d, z3.d\n" "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z7.s\n" + "sqadd z9.s, z9.s, z6.s\n" + "sqadd z10.s, z10.s, z5.s\n" + "sqadd z11.s, z11.s, z4.s\n" + "and z7.d, z12.d, z0.d\n" + "and z6.d, z13.d, z1.d\n" + "and z5.d, z14.d, z2.d\n" + "and z4.d, z15.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z20.s, z20.s, z4.s\n" - "sqadd z21.s, z21.s, z5.s\n" - "sqadd z22.s, z22.s, z6.s\n" - "sqadd z23.s, z23.s, z7.s\n" - "and z4.d, z24.d, z0.d\n" - "and z5.d, z25.d, z1.d\n" - "and z6.d, z26.d, z2.d\n" - "and z7.d, z27.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z12.s, z12.s, z7.s\n" + "sqadd z13.s, z13.s, z6.s\n" + "sqadd z14.s, z14.s, z5.s\n" + "sqadd z15.s, z15.s, z4.s\n" + "and z7.d, z16.d, z0.d\n" + "and z6.d, z17.d, z1.d\n" + "and z5.d, z18.d, z2.d\n" + "and z4.d, z19.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z5.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z7.s\n" - "and z4.d, z28.d, z0.d\n" - "and z5.d, z29.d, z1.d\n" - "and z6.d, z30.d, z2.d\n" - "and z7.d, z31.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "sqadd z16.s, z16.s, z7.s\n" + "sqadd z17.s, z17.s, z6.s\n" + "sqadd z18.s, z18.s, z5.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "and z7.d, z20.d, z0.d\n" + "and z6.d, z21.d, z1.d\n" + "and z5.d, z22.d, z2.d\n" + "and z4.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z20.s, z20.s, z7.s\n" + "sqadd z21.s, z21.s, z6.s\n" + "sqadd z22.s, z22.s, z5.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z7.d, z24.d, z0.d\n" + "and z6.d, z25.d, z1.d\n" + "and z5.d, z26.d, z2.d\n" + "and z4.d, z27.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z24.s, z24.s, z7.s\n" + "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z5.s\n" + "sqadd z27.s, z27.s, z4.s\n" + "and z7.d, z28.d, z0.d\n" + "and z6.d, z29.d, z1.d\n" + "and z5.d, z30.d, z2.d\n" + "and z4.d, z31.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z28.s, z28.s, z4.s\n" - "sqadd z29.s, z29.s, z5.s\n" - "sqadd z30.s, z30.s, z6.s\n" - "sqadd z31.s, z31.s, z7.s\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z28.s, z28.s, z7.s\n" + "sqadd z29.s, z29.s, z6.s\n" + "sqadd z30.s, z30.s, z5.s\n" + "sqadd 
z31.s, z31.s, z4.s\n" "77:" // Height 6: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" "add z8.s, z8.s, z4.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" @@ -2500,83 +2500,83 @@ void sve_hybrid_s8qs_dot_6x4VL ( "add z29.s, z29.s, z4.s\n" "add z30.s, z30.s, z4.s\n" ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" "add z31.s, z31.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "smin z8.s, p2/M, z8.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z0.s\n" + "smin z10.s, p2/M, z10.s, z0.s\n" + "smin z11.s, p2/M, z11.s, z0.s\n" + "smin z12.s, p2/M, z12.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z0.s\n" + "smin z14.s, p2/M, z14.s, z0.s\n" + "smin z15.s, p2/M, z15.s, z0.s\n" + "smin z16.s, p2/M, z16.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z0.s\n" + "smin z19.s, p2/M, z19.s, z0.s\n" + "smin z20.s, p2/M, z20.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z0.s\n" + "smin z22.s, p2/M, z22.s, z0.s\n" + "smin z23.s, p2/M, z23.s, z0.s\n" + "smin z24.s, p2/M, z24.s, z0.s\n" + "smin z25.s, p2/M, z25.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z0.s\n" + "smin z27.s, p2/M, z27.s, z0.s\n" + "smin z28.s, p2/M, z28.s, z0.s\n" + "smin z29.s, p2/M, z29.s, z0.s\n" + "smin z30.s, p2/M, z30.s, z0.s\n" + "smin z31.s, p2/M, z31.s, z0.s\n" + "smax z8.s, p2/M, z8.s, z1.s\n" + "smax z9.s, p2/M, z9.s, z1.s\n" + "smax z10.s, p2/M, z10.s, z1.s\n" "uzp1 z8.h, z8.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z13.s, p2/M, z13.s, z5.s\n" - "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z1.s\n" + "smax z12.s, p2/M, z12.s, z1.s\n" + "uzp1 z0.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z0.b\n" + "smax z13.s, p2/M, z13.s, z1.s\n" + "smax z14.s, p2/M, z14.s, z1.s\n" "uzp1 z12.h, z12.h, z13.h\n" "st1b { z8.b }, p1, [x11]\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "uzp1 z13.h, z14.h, z15.h\n" - "uzp1 z12.b, z12.b, z13.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z1.s\n" + "smax z16.s, p2/M, z16.s, z1.s\n" + "uzp1 z0.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z0.b\n" + "smax 
z17.s, p2/M, z17.s, z1.s\n" + "smax z18.s, p2/M, z18.s, z1.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z12.b }, p1, [x24]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "st1b { z12.b }, p1, [x26]\n" + "smax z19.s, p2/M, z19.s, z1.s\n" + "smax z20.s, p2/M, z20.s, z1.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z1.s\n" + "smax z22.s, p2/M, z22.s, z1.s\n" "uzp1 z20.h, z20.h, z21.h\n" - "st1b { z16.b }, p1, [x23]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "st1b { z16.b }, p1, [x25]\n" + "smax z23.s, p2/M, z23.s, z1.s\n" + "smax z24.s, p2/M, z24.s, z1.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z1.s\n" + "smax z26.s, p2/M, z26.s, z1.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z20.b }, p1, [x22]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "smax z29.s, p2/M, z29.s, z5.s\n" - "smax z30.s, p2/M, z30.s, z5.s\n" + "st1b { z20.b }, p1, [x24]\n" + "smax z27.s, p2/M, z27.s, z1.s\n" + "smax z28.s, p2/M, z28.s, z1.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "smax z29.s, p2/M, z29.s, z1.s\n" + "smax z30.s, p2/M, z30.s, z1.s\n" "uzp1 z28.h, z28.h, z29.h\n" - "st1b { z24.b }, p1, [x21]\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "uzp1 z29.h, z30.h, z31.h\n" - "uzp1 z28.b, z28.b, z29.b\n" - "st1b { z28.b }, p1, [x20]\n" + "st1b { z24.b }, p1, [x23]\n" + "smax z31.s, p2/M, z31.s, z1.s\n" + "uzp1 z16.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z16.b\n" + "st1b { z28.b }, p1, [x22]\n" "addvl x11, x11, #1\n" "78:" // Height 6: Writeback done "decw x10, ALL, MUL #4\n" @@ -2594,7 +2594,6 @@ void sve_hybrid_s8qs_dot_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -2602,4 +2601,4 @@ void sve_hybrid_s8qs_dot_6x4VL ( } } // namespace arm_gemm -#endif // 
__ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp index 2b7ad8bf4b..b1b1135c73 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -97,5 +96,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp index 6041794bdb..cd5f85411c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp @@ -117,11 +117,11 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -133,86 +133,86 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "ble 8f\n" "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 
0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n" + ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n" + ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n" + ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n" + ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "bgt 7b\n" "8:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 
0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" "addvl x9, x9, #8\n" "ble 9f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" + ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" + ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" + ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" + ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" "addvl x9, x9, #8\n" "9:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -221,18 +221,18 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "bne 4b\n" "uzp1 z8.d, z8.d, z12.d\n" "uzp1 z9.d, z9.d, z13.d\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x14]\n" + "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" "uzp1 z10.d, z10.d, z14.d\n" "uzp1 z11.d, z11.d, z15.d\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" "mov z15.d, z8.d\n" - "add z15.s, z15.s, z0.s\n" + "add z15.s, z15.s, z19.s\n" "addvl x14, x14, #4\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" + "add z9.s, z9.s, z18.s\n" + "add z10.s, z10.s, 
z17.s\n" + "add z11.s, z11.s, z16.s\n" "tbz %x[flags], #4, 10f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -246,10 +246,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 11f\n" "10:" // Height 1: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -262,44 +262,44 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" "tbz %x[flags], #5, 12f\n" - "and z4.d, z15.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z15.s, z15.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" + "and z19.d, z15.d, z0.d\n" + "and z18.d, z9.d, z1.d\n" + "and z17.d, z10.d, z2.d\n" + "and z16.d, z11.d, z3.d\n" + "asr z19.s, z19.s, #0x1f\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z15.s, z15.s, z19.s\n" + "sqadd z9.s, z9.s, z18.s\n" + "sqadd z10.s, z10.s, z17.s\n" + "sqadd z11.s, z11.s, z16.s\n" "12:" // Height 1: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z15.s, z15.s, z4.s\n" + "add z15.s, z15.s, z17.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z17.s\n" + "add z10.s, z10.s, z17.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z11.s, z11.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "add z11.s, z11.s, z17.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z31.s }, p2/Z, [x20]\n" + "smin z15.s, p2/M, z15.s, z16.s\n" + "smin z9.s, p2/M, z9.s, z16.s\n" + "smin z10.s, p2/M, z10.s, z16.s\n" + "smin z11.s, p2/M, z11.s, z16.s\n" + "smax z15.s, p2/M, z15.s, z31.s\n" + "smax z9.s, p2/M, z9.s, z31.s\n" + "smax z10.s, p2/M, z10.s, z31.s\n" "uzp1 z15.h, z15.h, z9.h\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z15.b, z15.b, z9.b\n" + "smax z11.s, p2/M, z11.s, z31.s\n" + "uzp1 z16.h, z10.h, z11.h\n" + "uzp1 z15.b, z15.b, z16.b\n" "st1b { z15.b }, p1, [x11]\n" "addvl x11, x11, #1\n" "13:" // Height 1: Writeback done @@ -330,12 +330,12 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "17:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 18f\n" - "ldr 
x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 19f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -343,125 +343,125 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "b 19f\n" "18:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "19:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 21f\n" "20:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-8, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n" + ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n" + ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-4, MUL 
VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n" + ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #-2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n" + ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "bgt 20b\n" "21:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" "addvl x9, x9, #8\n" "ble 22f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p2/Z, [x9]\n" + "ld1b { z16.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" + ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" + ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, 
#4, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" + ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p2/Z, [x9, #6, MUL VL]\n" + "ld1b { z16.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" + ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" "addvl x9, x9, #8\n" "22:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 17b\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z20.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ld1w { z0.s }, p2/Z, [x14]\n" + "ld1w { z19.s }, p2/Z, [x14]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x14, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add x24, x11, x20\n" + "ld1w { z16.s }, p2/Z, [x14, #3, MUL VL]\n" + "add x26, x11, x20\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" "addvl x14, x14, #4\n" - "mov z15.d, z7.d\n" - "add z15.s, z15.s, z0.s\n" - "add z12.s, z12.s, z1.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z3.s\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" + "mov z15.d, z20.d\n" + "add z15.s, z15.s, z19.s\n" + "add z12.s, z12.s, z18.s\n" + "add z13.s, z13.s, z17.s\n" + "add z14.s, z14.s, z16.s\n" + "add z8.s, z8.s, z19.s\n" + "add z9.s, z9.s, z18.s\n" + "add z10.s, z10.s, z17.s\n" + "add z11.s, z11.s, z16.s\n" "tbz %x[flags], #4, 23f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -475,10 +475,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 24f\n" "23:" // Height 2: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -495,77 +495,77 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" "tbz %x[flags], #5, 25f\n" - "and z4.d, z15.d, z0.d\n" - "and z5.d, z12.d, z1.d\n" - "and z6.d, z13.d, z2.d\n" - "and z7.d, z14.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z15.s, z15.s, z4.s\n" - "sqadd z12.s, z12.s, z5.s\n" - "sqadd z13.s, z13.s, z6.s\n" - "sqadd z14.s, z14.s, z7.s\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" + "and z19.d, z15.d, z0.d\n" + "and z18.d, z12.d, z1.d\n" + "and z17.d, z13.d, z2.d\n" + "and z16.d, z14.d, z3.d\n" + "asr z19.s, z19.s, #0x1f\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z15.s, z15.s, z19.s\n" + "sqadd z12.s, z12.s, z18.s\n" + "sqadd z13.s, z13.s, 
z17.s\n" + "sqadd z14.s, z14.s, z16.s\n" + "and z18.d, z8.d, z0.d\n" + "and z24.d, z9.d, z1.d\n" + "and z17.d, z10.d, z2.d\n" + "and z16.d, z11.d, z3.d\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "asr z17.s, z17.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z8.s, z8.s, z18.s\n" + "sqadd z9.s, z9.s, z24.s\n" + "sqadd z10.s, z10.s, z17.s\n" + "sqadd z11.s, z11.s, z16.s\n" "25:" // Height 2: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z15.s, z15.s, z4.s\n" + "add z15.s, z15.s, z17.s\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" - "add z12.s, z12.s, z4.s\n" - "add z13.s, z13.s, z4.s\n" + "add z12.s, z12.s, z17.s\n" + "add z13.s, z13.s, z17.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z14.s, z14.s, z4.s\n" - "add z8.s, z8.s, z4.s\n" + "add z14.s, z14.s, z17.s\n" + "add z8.s, z8.s, z17.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z17.s\n" + "add z10.s, z10.s, z17.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z11.s, z11.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "smax z13.s, p2/M, z13.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "add z11.s, z11.s, z17.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z17.s }, p2/Z, [x20]\n" + "smin z15.s, p2/M, z15.s, z16.s\n" + "smin z12.s, p2/M, z12.s, z16.s\n" + "smin z13.s, p2/M, z13.s, z16.s\n" + "smin z14.s, p2/M, z14.s, z16.s\n" + "smin z8.s, p2/M, z8.s, z16.s\n" + "smin z9.s, p2/M, z9.s, z16.s\n" + "smin z10.s, p2/M, z10.s, z16.s\n" + "smin z11.s, p2/M, z11.s, z16.s\n" + "smax z15.s, p2/M, z15.s, z17.s\n" + "smax z12.s, p2/M, z12.s, z17.s\n" + "smax z13.s, p2/M, z13.s, z17.s\n" "uzp1 z15.h, z15.h, z12.h\n" - "smax z14.s, p2/M, z14.s, z5.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "uzp1 z12.h, z13.h, z14.h\n" - "uzp1 z15.b, z15.b, z12.b\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z17.s\n" + "smax z8.s, p2/M, z8.s, z17.s\n" + "uzp1 z16.h, z13.h, z14.h\n" + "uzp1 z15.b, z15.b, z16.b\n" + "smax z9.s, p2/M, z9.s, z17.s\n" + "smax z10.s, p2/M, z10.s, z17.s\n" "uzp1 z8.h, z8.h, z9.h\n" "st1b { z15.b }, p1, [x11]\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "st1b { z8.b }, p1, [x24]\n" + "smax z11.s, p2/M, z11.s, z17.s\n" + "uzp1 z16.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z16.b\n" + "st1b { z8.b }, p1, [x26]\n" "addvl x11, x11, #1\n" "26:" // Height 2: Writeback done "decw x10, ALL, MUL #4\n" @@ -603,13 +603,13 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "30:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr 
x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 31f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 32f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -618,174 +618,174 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "b 32f\n" "31:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "32:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 34f\n" "33:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "trn1 z27.d, z30.d, z24.d\n" + "trn2 z30.d, z30.d, z24.d\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "trn1 z26.d, z28.d, z29.d\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z29.d\n" + ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" "cmp x27, #0x10\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" + ".inst 
0x45199b53 // smmla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n" + ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n" + ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n" + ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n" + ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n" + ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n" + ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n" + ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n" + ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n" + ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n" "bgt 33b\n" "34:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" + "trn1 z27.d, z1.d, z24.d\n" + "trn2 z1.d, z1.d, z24.d\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "trn1 
z26.d, z3.d, z28.d\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z28.d\n" + ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" "ble 35f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n" + ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" + ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" + ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" + ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" + ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4519982a // smmla z10.s, 
z1.b, z25.b\n" + ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" + ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" + ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" + ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" + ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" "35:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 30b\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z28.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z0.s }, p2/Z, [x14]\n" + "ld1w { z27.s }, p2/Z, [x14]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add x24, x11, x20\n" + "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" + "add x26, x11, x20\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x23, x24, x20\n" + "add x25, x26, x20\n" "addvl x14, x14, #4\n" "uzp1 z16.d, z16.d, z20.d\n" "uzp1 z17.d, z17.d, z21.d\n" "uzp1 z18.d, z18.d, z22.d\n" "uzp1 z19.d, z19.d, z23.d\n" - "mov z23.d, z7.d\n" - "add z23.s, z23.s, z0.s\n" - "add z12.s, z12.s, z1.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z3.s\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "mov z23.d, z28.d\n" + "add z23.s, z23.s, z27.s\n" + "add z12.s, z12.s, z26.s\n" + "add z13.s, z13.s, z25.s\n" + "add z14.s, z14.s, z24.s\n" + "add z8.s, z8.s, z27.s\n" + "add z9.s, z9.s, z26.s\n" + "add z10.s, z10.s, z25.s\n" + "add z11.s, z11.s, z24.s\n" + "add z16.s, z16.s, z27.s\n" + "add z17.s, z17.s, z26.s\n" + "add z18.s, z18.s, z25.s\n" + "add z19.s, z19.s, z24.s\n" "tbz %x[flags], #4, 36f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -799,10 +799,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 37f\n" "36:" // Height 3: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -823,109 +823,109 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" "tbz %x[flags], #5, 38f\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z12.d, z1.d\n" - "and z6.d, z13.d, z2.d\n" - "and z7.d, z14.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z12.s, z12.s, z5.s\n" - "sqadd z13.s, z13.s, 
z6.s\n" - "sqadd z14.s, z14.s, z7.s\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z24.d, z23.d, z0.d\n" + "and z22.d, z12.d, z1.d\n" + "and z21.d, z13.d, z2.d\n" + "and z20.d, z14.d, z3.d\n" + "asr z24.s, z24.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z23.s, z23.s, z24.s\n" + "sqadd z12.s, z12.s, z22.s\n" + "sqadd z13.s, z13.s, z21.s\n" + "sqadd z14.s, z14.s, z20.s\n" + "and z24.d, z8.d, z0.d\n" + "and z22.d, z9.d, z1.d\n" + "and z21.d, z10.d, z2.d\n" + "and z20.d, z11.d, z3.d\n" + "asr z24.s, z24.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z8.s, z8.s, z24.s\n" + "sqadd z9.s, z9.s, z22.s\n" + "sqadd z10.s, z10.s, z21.s\n" + "sqadd z11.s, z11.s, z20.s\n" + "and z24.d, z16.d, z0.d\n" + "and z22.d, z17.d, z1.d\n" + "and z21.d, z18.d, z2.d\n" + "and z20.d, z19.d, z3.d\n" + "asr z24.s, z24.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z16.s, z16.s, z24.s\n" + "sqadd z17.s, z17.s, z22.s\n" + "sqadd z18.s, z18.s, z21.s\n" + "sqadd z19.s, z19.s, z20.s\n" "38:" // Height 3: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z21.s\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" - "add z12.s, z12.s, z4.s\n" - "add z13.s, z13.s, z4.s\n" + "add z12.s, z12.s, z21.s\n" + "add z13.s, z13.s, z21.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z14.s, z14.s, z4.s\n" - "add z8.s, z8.s, z4.s\n" + "add z14.s, z14.s, z21.s\n" + "add z8.s, z8.s, z21.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z21.s\n" + "add z10.s, z10.s, z21.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z11.s, z11.s, z21.s\n" + "add z16.s, z16.s, z21.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z21.s\n" + "add z18.s, z18.s, z21.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z19.s, z19.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, 
z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "smax z13.s, p2/M, z13.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z21.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z20.s\n" + "smin z12.s, p2/M, z12.s, z20.s\n" + "smin z13.s, p2/M, z13.s, z20.s\n" + "smin z14.s, p2/M, z14.s, z20.s\n" + "smin z8.s, p2/M, z8.s, z20.s\n" + "smin z9.s, p2/M, z9.s, z20.s\n" + "smin z10.s, p2/M, z10.s, z20.s\n" + "smin z11.s, p2/M, z11.s, z20.s\n" + "smin z16.s, p2/M, z16.s, z20.s\n" + "smin z17.s, p2/M, z17.s, z20.s\n" + "smin z18.s, p2/M, z18.s, z20.s\n" + "smin z19.s, p2/M, z19.s, z20.s\n" + "smax z23.s, p2/M, z23.s, z21.s\n" + "smax z12.s, p2/M, z12.s, z21.s\n" + "smax z13.s, p2/M, z13.s, z21.s\n" "uzp1 z23.h, z23.h, z12.h\n" - "smax z14.s, p2/M, z14.s, z5.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "uzp1 z12.h, z13.h, z14.h\n" - "uzp1 z23.b, z23.b, z12.b\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z21.s\n" + "smax z8.s, p2/M, z8.s, z21.s\n" + "uzp1 z20.h, z13.h, z14.h\n" + "uzp1 z23.b, z23.b, z20.b\n" + "smax z9.s, p2/M, z9.s, z21.s\n" + "smax z10.s, p2/M, z10.s, z21.s\n" "uzp1 z8.h, z8.h, z9.h\n" "st1b { z23.b }, p1, [x11]\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z21.s\n" + "smax z16.s, p2/M, z16.s, z21.s\n" + "uzp1 z20.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z20.b\n" + "smax z17.s, p2/M, z17.s, z21.s\n" + "smax z18.s, p2/M, z18.s, z21.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z8.b }, p1, [x24]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "st1b { z8.b }, p1, [x26]\n" + "smax z19.s, p2/M, z19.s, z21.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x23]\n" + "st1b { z16.b }, p1, [x25]\n" "addvl x11, x11, #1\n" "39:" // Height 3: Writeback done "decw x10, ALL, MUL #4\n" @@ -963,14 +963,14 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "43:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 44f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 45f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -980,161 +980,161 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "b 45f\n" "44:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "45:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 
47f\n" "46:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z29.d, z30.d, z24.d\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z30.d, z30.d, z24.d\n" + "trn1 z26.d, z28.d, z27.d\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z27.d\n" + ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" "cmp x27, #0x10\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n" + ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506982c // 
smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n" + ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-5, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n" + ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n" + ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n" + ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n" + ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n" + ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n" + ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n" + ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n" "bgt 46b\n" "47:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z28.d, z1.d, z24.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z24.d\n" + "trn1 z26.d, z3.d, z27.d\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" 
- "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z27.d\n" + ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" "ble 48f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x9]\n" + "ld1b { z24.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n" + ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" + ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" + ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" + ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" + ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n" + ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" + ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" + ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" + ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" + ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" 
"48:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 43b\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z28.d, z8.d, z12.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z8.d, z8.d, z12.d\n" - "ld1w { z0.s }, p2/Z, [x14]\n" + "ld1w { z27.s }, p2/Z, [x14]\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x14, #2, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add x24, x11, x20\n" + "ld1w { z24.s }, p2/Z, [x14, #3, MUL VL]\n" + "add x26, x11, x20\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" "addvl x14, x14, #4\n" @@ -1144,23 +1144,23 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "mov z23.d, z7.d\n" - "add z23.s, z23.s, z0.s\n" - "add z12.s, z12.s, z1.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z3.s\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" - "add z15.s, z15.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "mov z23.d, z28.d\n" + "add z23.s, z23.s, z27.s\n" + "add z12.s, z12.s, z26.s\n" + "add z13.s, z13.s, z25.s\n" + "add z14.s, z14.s, z24.s\n" + "add z8.s, z8.s, z27.s\n" + "add z9.s, z9.s, z26.s\n" + "add z10.s, z10.s, z25.s\n" + "add z11.s, z11.s, z24.s\n" + "add z15.s, z15.s, z27.s\n" + "add z20.s, z20.s, z26.s\n" + "add z21.s, z21.s, z25.s\n" + "add z22.s, z22.s, z24.s\n" + "add z16.s, z16.s, z27.s\n" + "add z17.s, z17.s, z26.s\n" + "add z18.s, z18.s, z25.s\n" + "add z19.s, z19.s, z24.s\n" "tbz %x[flags], #4, 49f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -1174,10 +1174,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 50f\n" "49:" // Height 4: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -1202,141 +1202,141 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" "tbz %x[flags], #5, 51f\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z12.d, z1.d\n" - "and z6.d, z13.d, z2.d\n" - "and z7.d, z14.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z12.s, z12.s, z5.s\n" - "sqadd z13.s, z13.s, z6.s\n" - "sqadd z14.s, z14.s, z7.s\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, 
z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z15.d, z0.d\n" - "and z5.d, z20.d, z1.d\n" - "and z6.d, z21.d, z2.d\n" - "and z7.d, z22.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z15.s, z15.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z27.d, z23.d, z0.d\n" + "and z26.d, z12.d, z1.d\n" + "and z25.d, z13.d, z2.d\n" + "and z24.d, z14.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z23.s, z23.s, z27.s\n" + "sqadd z12.s, z12.s, z26.s\n" + "sqadd z13.s, z13.s, z25.s\n" + "sqadd z14.s, z14.s, z24.s\n" + "and z27.d, z8.d, z0.d\n" + "and z26.d, z9.d, z1.d\n" + "and z25.d, z10.d, z2.d\n" + "and z24.d, z11.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z8.s, z8.s, z27.s\n" + "sqadd z9.s, z9.s, z26.s\n" + "sqadd z10.s, z10.s, z25.s\n" + "sqadd z11.s, z11.s, z24.s\n" + "and z27.d, z15.d, z0.d\n" + "and z26.d, z20.d, z1.d\n" + "and z25.d, z21.d, z2.d\n" + "and z24.d, z22.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z15.s, z15.s, z27.s\n" + "sqadd z20.s, z20.s, z26.s\n" + "sqadd z21.s, z21.s, z25.s\n" + "sqadd z22.s, z22.s, z24.s\n" + "and z27.d, z16.d, z0.d\n" + "and z26.d, z17.d, z1.d\n" + "and z25.d, z18.d, z2.d\n" + "and z24.d, z19.d, z3.d\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z16.s, z16.s, z27.s\n" + "sqadd z17.s, z17.s, z26.s\n" + "sqadd z18.s, z18.s, z25.s\n" + "sqadd z19.s, z19.s, z24.s\n" "51:" // Height 4: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z25.s\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" - "add z12.s, z12.s, z4.s\n" - "add z13.s, z13.s, z4.s\n" + "add z12.s, z12.s, z25.s\n" + "add z13.s, z13.s, z25.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z14.s, z14.s, z4.s\n" - "add z8.s, z8.s, z4.s\n" + "add z14.s, z14.s, z25.s\n" + "add z8.s, z8.s, z25.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z25.s\n" + "add z10.s, z10.s, z25.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z15.s, z15.s, z4.s\n" + "add z11.s, z11.s, z25.s\n" + "add z15.s, z15.s, z25.s\n" ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" - "add z20.s, z20.s, z4.s\n" - 
"add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z25.s\n" + "add z21.s, z21.s, z25.s\n" ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z25.s\n" + "add z16.s, z16.s, z25.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z25.s\n" + "add z18.s, z18.s, z25.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z19.s, z19.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "smax z13.s, p2/M, z13.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z25.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z24.s\n" + "smin z12.s, p2/M, z12.s, z24.s\n" + "smin z13.s, p2/M, z13.s, z24.s\n" + "smin z14.s, p2/M, z14.s, z24.s\n" + "smin z8.s, p2/M, z8.s, z24.s\n" + "smin z9.s, p2/M, z9.s, z24.s\n" + "smin z10.s, p2/M, z10.s, z24.s\n" + "smin z11.s, p2/M, z11.s, z24.s\n" + "smin z15.s, p2/M, z15.s, z24.s\n" + "smin z20.s, p2/M, z20.s, z24.s\n" + "smin z21.s, p2/M, z21.s, z24.s\n" + "smin z22.s, p2/M, z22.s, z24.s\n" + "smin z16.s, p2/M, z16.s, z24.s\n" + "smin z17.s, p2/M, z17.s, z24.s\n" + "smin z18.s, p2/M, z18.s, z24.s\n" + "smin z19.s, p2/M, z19.s, z24.s\n" + "smax z23.s, p2/M, z23.s, z25.s\n" + "smax z12.s, p2/M, z12.s, z25.s\n" + "smax z13.s, p2/M, z13.s, z25.s\n" "uzp1 z23.h, z23.h, z12.h\n" - "smax z14.s, p2/M, z14.s, z5.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "uzp1 z12.h, z13.h, z14.h\n" - "uzp1 z23.b, z23.b, z12.b\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z25.s\n" + "smax z8.s, p2/M, z8.s, z25.s\n" + "uzp1 z24.h, z13.h, z14.h\n" + "uzp1 z23.b, z23.b, z24.b\n" + "smax z9.s, p2/M, z9.s, z25.s\n" + "smax z10.s, p2/M, z10.s, z25.s\n" "uzp1 z8.h, z8.h, z9.h\n" "st1b { z23.b }, p1, [x11]\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z25.s\n" + "smax z15.s, p2/M, z15.s, z25.s\n" + "uzp1 z23.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z23.b\n" + "smax z20.s, p2/M, z20.s, z25.s\n" + "smax z21.s, p2/M, z21.s, z25.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "st1b { z8.b }, p1, [x24]\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "st1b { z8.b }, p1, [x26]\n" + "smax z22.s, p2/M, z22.s, z25.s\n" + "smax z16.s, p2/M, z16.s, z25.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z15.b, z15.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, 
p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z25.s\n" + "smax z18.s, p2/M, z18.s, z25.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z15.b }, p1, [x23]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "st1b { z15.b }, p1, [x25]\n" + "smax z19.s, p2/M, z19.s, z25.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x22]\n" + "st1b { z16.b }, p1, [x24]\n" "addvl x11, x11, #1\n" "52:" // Height 4: Writeback done "decw x10, ALL, MUL #4\n" @@ -1382,15 +1382,15 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "56:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 57f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 58f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1401,204 +1401,204 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "b 58f\n" "57:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "58:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 60f\n" "59:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1rqb { z6.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x24]\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z5.d, z6.d, z1.d\n" + "trn2 z6.d, z6.d, z1.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "trn1 z3.d, z7.d, z2.d\n" + "trn2 z7.d, z7.d, z2.d\n" + "ld1b { z1.b }, p2/Z, [x9]\n" + "trn1 z2.d, z4.d, z0.d\n" + "trn2 z4.d, z4.d, z0.d\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n" + ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n" + ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n" + ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x450198a9 // smmla z9.s, z5.b, 
z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n" + ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n" + ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n" + ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n" + ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n" + ".inst 
0x4501985b // smmla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" + ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n" + ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" + ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" + ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n" + ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" + ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" + ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n" + ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" + ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" + ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n" + ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n" + ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" + ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "bgt 59b\n" "60:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z7.d, z1.d, z4.d\n" + "trn2 z1.d, z1.d, z4.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "trn1 z6.d, z3.d, z2.d\n" + "trn2 z3.d, z3.d, z2.d\n" + "ld1b { z2.b }, p2/Z, [x9]\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" + ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" + ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL 
VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" + ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" + ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" + ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" + ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" "addvl x9, x9, #8\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" + ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "ble 61f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1b { z2.b }, p2/Z, [x9]\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" + ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" + ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" + ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n" 
+ ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" + ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" + ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" + ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" + ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" + ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" + ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" + ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" + ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" + ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" + ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" + ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" "61:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 56b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z4.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "add x24, x11, x20\n" + "add x26, x11, x20\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "ld1w { z0.s }, p2/Z, [x14]\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x14]\n" + "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x20\n" + "add x24, x25, x20\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x21, x22, x20\n" + "add x23, x24, x20\n" "addvl x14, x14, #4\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" @@ -1610,27 +1610,27 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" - "add z31.s, z31.s, z0.s\n" - "add z12.s, z12.s, z1.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z3.s\n" - "add z8.s, z8.s, z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" - "add z15.s, z15.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" + "mov z31.d, z4.d\n" + "add z31.s, z31.s, z3.s\n" + "add z12.s, z12.s, z2.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z0.s\n" + 
"add z8.s, z8.s, z3.s\n" + "add z9.s, z9.s, z2.s\n" + "add z10.s, z10.s, z1.s\n" + "add z11.s, z11.s, z0.s\n" + "add z15.s, z15.s, z3.s\n" + "add z20.s, z20.s, z2.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z0.s\n" + "add z16.s, z16.s, z3.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z1.s\n" + "add z19.s, z19.s, z0.s\n" + "add z24.s, z24.s, z3.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z1.s\n" + "add z27.s, z27.s, z0.s\n" "tbz %x[flags], #4, 62f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -1644,10 +1644,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 63f\n" "62:" // Height 5: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -1676,173 +1676,173 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" "tbz %x[flags], #5, 64f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z12.d, z1.d\n" - "and z6.d, z13.d, z2.d\n" - "and z7.d, z14.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z12.s, z12.s, z5.s\n" - "sqadd z13.s, z13.s, z6.s\n" - "sqadd z14.s, z14.s, z7.s\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z15.d, z0.d\n" - "and z5.d, z20.d, z1.d\n" - "and z6.d, z21.d, z2.d\n" - "and z7.d, z22.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z15.s, z15.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "and z4.d, z24.d, z0.d\n" - "and z5.d, z25.d, z1.d\n" - "and z6.d, z26.d, z2.d\n" - "and z7.d, z27.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z5.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z7.s\n" + "and z30.d, z31.d, z0.d\n" + "and z29.d, z12.d, z1.d\n" + "and z28.d, z13.d, z2.d\n" + "and z23.d, z14.d, z3.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z31.s, z31.s, z30.s\n" + "sqadd z12.s, z12.s, z29.s\n" + "sqadd z13.s, z13.s, z28.s\n" + "sqadd z14.s, z14.s, z23.s\n" + "and z30.d, z8.d, z0.d\n" + "and z29.d, z9.d, z1.d\n" + "and z28.d, z10.d, z2.d\n" + "and z23.d, z11.d, z3.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, 
z23.s, #0x1f\n" + "sqadd z8.s, z8.s, z30.s\n" + "sqadd z9.s, z9.s, z29.s\n" + "sqadd z10.s, z10.s, z28.s\n" + "sqadd z11.s, z11.s, z23.s\n" + "and z30.d, z15.d, z0.d\n" + "and z29.d, z20.d, z1.d\n" + "and z28.d, z21.d, z2.d\n" + "and z23.d, z22.d, z3.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z15.s, z15.s, z30.s\n" + "sqadd z20.s, z20.s, z29.s\n" + "sqadd z21.s, z21.s, z28.s\n" + "sqadd z22.s, z22.s, z23.s\n" + "and z30.d, z16.d, z0.d\n" + "and z29.d, z17.d, z1.d\n" + "and z28.d, z18.d, z2.d\n" + "and z23.d, z19.d, z3.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z16.s, z16.s, z30.s\n" + "sqadd z17.s, z17.s, z29.s\n" + "sqadd z18.s, z18.s, z28.s\n" + "sqadd z19.s, z19.s, z23.s\n" + "and z30.d, z24.d, z0.d\n" + "and z29.d, z25.d, z1.d\n" + "and z28.d, z26.d, z2.d\n" + "and z23.d, z27.d, z3.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z24.s, z24.s, z30.s\n" + "sqadd z25.s, z25.s, z29.s\n" + "sqadd z26.s, z26.s, z28.s\n" + "sqadd z27.s, z27.s, z23.s\n" "64:" // Height 5: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add z31.s, z31.s, z4.s\n" + "add z31.s, z31.s, z28.s\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n" - "add z12.s, z12.s, z4.s\n" - "add z13.s, z13.s, z4.s\n" + "add z12.s, z12.s, z28.s\n" + "add z13.s, z13.s, z28.s\n" ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n" ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" - "add z14.s, z14.s, z4.s\n" - "add z8.s, z8.s, z4.s\n" + "add z14.s, z14.s, z28.s\n" + "add z8.s, z8.s, z28.s\n" ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" - "add z9.s, z9.s, z4.s\n" - "add z10.s, z10.s, z4.s\n" + "add z9.s, z9.s, z28.s\n" + "add z10.s, z10.s, z28.s\n" ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n" - "add z11.s, z11.s, z4.s\n" - "add z15.s, z15.s, z4.s\n" + "add z11.s, z11.s, z28.s\n" + "add z15.s, z15.s, z28.s\n" ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n" ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z28.s\n" + "add z21.s, z21.s, z28.s\n" ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z28.s\n" + "add z16.s, z16.s, z28.s\n" ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z28.s\n" + "add z18.s, z18.s, z28.s\n" ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z19.s, z19.s, z28.s\n" + "add z24.s, z24.s, z28.s\n" ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z28.s\n" + "add z26.s, z26.s, 
z28.s\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" - "add z27.s, z27.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "smax z13.s, p2/M, z13.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z28.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z23.s\n" + "smin z12.s, p2/M, z12.s, z23.s\n" + "smin z13.s, p2/M, z13.s, z23.s\n" + "smin z14.s, p2/M, z14.s, z23.s\n" + "smin z8.s, p2/M, z8.s, z23.s\n" + "smin z9.s, p2/M, z9.s, z23.s\n" + "smin z10.s, p2/M, z10.s, z23.s\n" + "smin z11.s, p2/M, z11.s, z23.s\n" + "smin z15.s, p2/M, z15.s, z23.s\n" + "smin z20.s, p2/M, z20.s, z23.s\n" + "smin z21.s, p2/M, z21.s, z23.s\n" + "smin z22.s, p2/M, z22.s, z23.s\n" + "smin z16.s, p2/M, z16.s, z23.s\n" + "smin z17.s, p2/M, z17.s, z23.s\n" + "smin z18.s, p2/M, z18.s, z23.s\n" + "smin z19.s, p2/M, z19.s, z23.s\n" + "smin z24.s, p2/M, z24.s, z23.s\n" + "smin z25.s, p2/M, z25.s, z23.s\n" + "smin z26.s, p2/M, z26.s, z23.s\n" + "smin z27.s, p2/M, z27.s, z23.s\n" + "smax z31.s, p2/M, z31.s, z28.s\n" + "smax z12.s, p2/M, z12.s, z28.s\n" + "smax z13.s, p2/M, z13.s, z28.s\n" "uzp1 z31.h, z31.h, z12.h\n" - "smax z14.s, p2/M, z14.s, z5.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "uzp1 z12.h, z13.h, z14.h\n" - "uzp1 z31.b, z31.b, z12.b\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z28.s\n" + "smax z8.s, p2/M, z8.s, z28.s\n" + "uzp1 z23.h, z13.h, z14.h\n" + "uzp1 z31.b, z31.b, z23.b\n" + "smax z9.s, p2/M, z9.s, z28.s\n" + "smax z10.s, p2/M, z10.s, z28.s\n" "uzp1 z8.h, z8.h, z9.h\n" "st1b { z31.b }, p1, [x11]\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z28.s\n" + "smax z15.s, p2/M, z15.s, z28.s\n" + "uzp1 z23.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z23.b\n" + "smax z20.s, p2/M, z20.s, z28.s\n" + "smax z21.s, p2/M, z21.s, z28.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "st1b { z8.b }, p1, [x24]\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "st1b { z8.b }, p1, [x26]\n" + "smax z22.s, p2/M, z22.s, z28.s\n" + "smax z16.s, p2/M, z16.s, z28.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z15.b, z15.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z28.s\n" + "smax z18.s, p2/M, z18.s, z28.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z15.b }, p1, [x23]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" + "st1b { 
z15.b }, p1, [x25]\n" + "smax z19.s, p2/M, z19.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z28.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z28.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x21]\n" + "st1b { z16.b }, p1, [x24]\n" + "smax z27.s, p2/M, z27.s, z28.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x23]\n" "addvl x11, x11, #1\n" "65:" // Height 5: Writeback done "decw x10, ALL, MUL #4\n" @@ -1891,16 +1891,16 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "69:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 70f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 71f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1912,209 +1912,209 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "b 71f\n" "70:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "71:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 73f\n" "72:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z6.d, z7.d, z0.d\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn2 z7.d, z7.d, z0.d\n" + "trn1 z4.d, z5.d, z1.d\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z5.d, z5.d, z1.d\n" + "trn1 z2.d, z3.d, z0.d\n" + "trn2 z3.d, z3.d, z0.d\n" + "ld1b { z1.b }, p2/Z, [x9]\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" + ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n" + ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // 
smmla z20.s, z2.b, z6.b\n" + ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" + ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n" + ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" + ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" "add x21, x21, #0x10\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n" + ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" + ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n" + ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-8, MUL VL]\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-6, MUL VL]\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, 
z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" + ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n" + ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-8, MUL VL]\n" + ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" + ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n" + ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-7, MUL VL]\n" + ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n" + ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n" + ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-6, MUL VL]\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n" + ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-5, MUL VL]\n" + ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n" + ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n" + ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n" + ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-3, MUL VL]\n" + ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n" + ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n" + ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [x9, #-2, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n" + ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #-1, MUL VL]\n" + ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n" + ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n" + ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n" + ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n" "bgt 72b\n" "73:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z7.d, z1.d, z0.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z0.d\n" + "trn1 z6.d, z3.d, z2.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z3.d, z3.d, z2.d\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z2.b }, p2/Z, [x9]\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" + ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" + ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980c // smmla z12.s, z0.b, 
z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" + ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" + ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" + ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" + ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" + ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" "addvl x9, x9, #8\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" + ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "ble 74f\n" - "ld1b { z7.b }, p2/Z, [x9]\n" - "ld1b { z6.b }, p2/Z, [x9, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #2, MUL VL]\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #4, MUL VL]\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - ".inst 0x4506982e // 
smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x9, #6, MUL VL]\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x9, #7, MUL VL]\n" + "ld1b { z2.b }, p2/Z, [x9]\n" + "ld1b { z0.b }, p2/Z, [x9, #1, MUL VL]\n" + ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" + ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" + ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" + ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #2, MUL VL]\n" + ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" + ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #3, MUL VL]\n" + ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" + ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" + ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" + ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #4, MUL VL]\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" + ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #5, MUL VL]\n" + ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" + ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" + ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" + ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p2/Z, [x9, #6, MUL VL]\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [x9, #7, MUL VL]\n" "addvl x9, x9, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" + ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" + ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" + ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" "74:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 69b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z8.d, z12.d\n" - "add x24, x11, x20\n" + "uzp1 z4.d, z8.d, z12.d\n" + "add x26, x11, x20\n" "uzp2 z8.d, z8.d, z12.d\n" "uzp1 z12.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "add x23, x24, x20\n" - "ld1w { z0.s }, p2/Z, [x14]\n" + "add x25, x26, x20\n" + "ld1w { z3.s }, p2/Z, [x14]\n" "uzp1 z13.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "ld1w { z1.s }, p2/Z, [x14, #1, MUL VL]\n" - "ld1w { z2.s }, p2/Z, [x14, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x14, #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [x14, #2, MUL VL]\n" "uzp1 z14.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "ld1w { z3.s }, p2/Z, [x14, #3, MUL VL]\n" - "add x22, x23, x20\n" + "ld1w { z0.s }, p2/Z, [x14, #3, MUL VL]\n" + "add x24, x25, x20\n" "uzp1 z15.d, z16.d, z20.d\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x21, x22, x20\n" - "add x20, x21, x20\n" + "add x23, x24, x20\n" + "add x22, x23, x20\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "addvl x14, x14, #4\n" @@ -2130,31 +2130,31 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" - "add z31.s, z31.s, z0.s\n" - "add z12.s, z12.s, z1.s\n" - "add z13.s, z13.s, z2.s\n" - "add z14.s, z14.s, z3.s\n" - "add z8.s, z8.s, 
z0.s\n" - "add z9.s, z9.s, z1.s\n" - "add z10.s, z10.s, z2.s\n" - "add z11.s, z11.s, z3.s\n" - "add z15.s, z15.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z23.s, z23.s, z0.s\n" - "add z28.s, z28.s, z1.s\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" + "mov z31.d, z4.d\n" + "add z31.s, z31.s, z3.s\n" + "add z12.s, z12.s, z2.s\n" + "add z13.s, z13.s, z1.s\n" + "add z14.s, z14.s, z0.s\n" + "add z8.s, z8.s, z3.s\n" + "add z9.s, z9.s, z2.s\n" + "add z10.s, z10.s, z1.s\n" + "add z11.s, z11.s, z0.s\n" + "add z15.s, z15.s, z3.s\n" + "add z20.s, z20.s, z2.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z0.s\n" + "add z16.s, z16.s, z3.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z1.s\n" + "add z19.s, z19.s, z0.s\n" + "add z23.s, z23.s, z3.s\n" + "add z28.s, z28.s, z2.s\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z0.s\n" + "add z24.s, z24.s, z3.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z1.s\n" + "add z27.s, z27.s, z0.s\n" "tbz %x[flags], #4, 75f\n" "ld1w { z0.s }, p2/Z, [x12]\n" "ld1w { z4.s }, p2/Z, [x13]\n" @@ -2168,10 +2168,10 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "addvl x13, x13, #4\n" "b 76f\n" "75:" // Height 6: per layer parameters - "add x26, %x[qp], %[per_layer_right_shift]\n" - "add x25, %x[qp], %[per_layer_mul]\n" - "ld1rw { z0.s }, p2/Z, [x26]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x21, %x[qp], %[per_layer_right_shift]\n" + "add x20, %x[qp], %[per_layer_mul]\n" + "ld1rw { z0.s }, p2/Z, [x21]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" "mov z1.d, z0.d\n" "mov z5.d, z4.d\n" "mov z2.d, z0.d\n" @@ -2204,81 +2204,81 @@ void sve_hybrid_s8qs_mmla_6x4VL ( ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" "tbz %x[flags], #5, 77f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z12.d, z1.d\n" - "and z6.d, z13.d, z2.d\n" - "and z7.d, z14.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" + "and z7.d, z31.d, z0.d\n" + "and z6.d, z12.d, z1.d\n" + "and z5.d, z13.d, z2.d\n" + "and z4.d, z14.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z12.s, z12.s, z5.s\n" - "sqadd z13.s, z13.s, z6.s\n" - "sqadd z14.s, z14.s, z7.s\n" - "and z4.d, z8.d, z0.d\n" - "and z5.d, z9.d, z1.d\n" - "and z6.d, z10.d, z2.d\n" - "and z7.d, z11.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z8.s, z8.s, z4.s\n" - "sqadd z9.s, z9.s, z5.s\n" - "sqadd z10.s, z10.s, z6.s\n" - "sqadd z11.s, z11.s, z7.s\n" - "and z4.d, z15.d, z0.d\n" - "and z5.d, z20.d, z1.d\n" - "and z6.d, z21.d, z2.d\n" - "and z7.d, z22.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z15.s, z15.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z1.d\n" - "and z6.d, z18.d, z2.d\n" - "and z7.d, z19.d, z3.d\n" "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" + "sqadd z31.s, z31.s, z7.s\n" + "sqadd z12.s, z12.s, z6.s\n" + "sqadd z13.s, z13.s, z5.s\n" + "sqadd z14.s, z14.s, z4.s\n" + "and z7.d, z8.d, z0.d\n" + "and z6.d, 
z9.d, z1.d\n" + "and z5.d, z10.d, z2.d\n" + "and z4.d, z11.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z28.d, z1.d\n" - "and z6.d, z29.d, z2.d\n" - "and z7.d, z30.d, z3.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z8.s, z8.s, z7.s\n" + "sqadd z9.s, z9.s, z6.s\n" + "sqadd z10.s, z10.s, z5.s\n" + "sqadd z11.s, z11.s, z4.s\n" + "and z7.d, z15.d, z0.d\n" + "and z6.d, z20.d, z1.d\n" + "and z5.d, z21.d, z2.d\n" + "and z4.d, z22.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z28.s, z28.s, z5.s\n" - "sqadd z29.s, z29.s, z6.s\n" - "sqadd z30.s, z30.s, z7.s\n" - "and z4.d, z24.d, z0.d\n" - "and z5.d, z25.d, z1.d\n" - "and z6.d, z26.d, z2.d\n" - "and z7.d, z27.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "sqadd z20.s, z20.s, z6.s\n" + "sqadd z21.s, z21.s, z5.s\n" + "sqadd z22.s, z22.s, z4.s\n" + "and z7.d, z16.d, z0.d\n" + "and z6.d, z17.d, z1.d\n" + "and z5.d, z18.d, z2.d\n" + "and z4.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z16.s, z16.s, z7.s\n" + "sqadd z17.s, z17.s, z6.s\n" + "sqadd z18.s, z18.s, z5.s\n" + "sqadd z19.s, z19.s, z4.s\n" + "and z7.d, z23.d, z0.d\n" + "and z6.d, z28.d, z1.d\n" + "and z5.d, z29.d, z2.d\n" + "and z4.d, z30.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z23.s, z23.s, z7.s\n" + "sqadd z28.s, z28.s, z6.s\n" + "sqadd z29.s, z29.s, z5.s\n" + "sqadd z30.s, z30.s, z4.s\n" + "and z7.d, z24.d, z0.d\n" + "and z6.d, z25.d, z1.d\n" + "and z5.d, z26.d, z2.d\n" + "and z4.d, z27.d, z3.d\n" "asr z7.s, z7.s, #0x1f\n" - "sqadd z24.s, z24.s, z4.s\n" - "sqadd z25.s, z25.s, z5.s\n" - "sqadd z26.s, z26.s, z6.s\n" - "sqadd z27.s, z27.s, z7.s\n" + "asr z6.s, z6.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z24.s, z24.s, z7.s\n" + "sqadd z25.s, z25.s, z6.s\n" + "sqadd z26.s, z26.s, z5.s\n" + "sqadd z27.s, z27.s, z4.s\n" "77:" // Height 6: no shift correction - "add x25, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" "add z31.s, z31.s, z4.s\n" ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n" @@ -2326,83 +2326,83 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "add z25.s, z25.s, z4.s\n" "add z26.s, z26.s, z4.s\n" ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" - "add x25, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x25]\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" "add z27.s, z27.s, z4.s\n" - "add x25, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x25]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z12.s, p2/M, z12.s, z6.s\n" - "smin z13.s, p2/M, z13.s, z6.s\n" - "smin z14.s, p2/M, z14.s, z6.s\n" - "smin z8.s, p2/M, z8.s, z6.s\n" - "smin z9.s, p2/M, z9.s, z6.s\n" - "smin z10.s, p2/M, z10.s, z6.s\n" - "smin z11.s, p2/M, z11.s, z6.s\n" - "smin z15.s, p2/M, z15.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin 
z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z12.s, p2/M, z12.s, z5.s\n" - "smax z13.s, p2/M, z13.s, z5.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z0.s\n" + "smin z12.s, p2/M, z12.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z0.s\n" + "smin z14.s, p2/M, z14.s, z0.s\n" + "smin z8.s, p2/M, z8.s, z0.s\n" + "smin z9.s, p2/M, z9.s, z0.s\n" + "smin z10.s, p2/M, z10.s, z0.s\n" + "smin z11.s, p2/M, z11.s, z0.s\n" + "smin z15.s, p2/M, z15.s, z0.s\n" + "smin z20.s, p2/M, z20.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z0.s\n" + "smin z22.s, p2/M, z22.s, z0.s\n" + "smin z16.s, p2/M, z16.s, z0.s\n" + "smin z17.s, p2/M, z17.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z0.s\n" + "smin z19.s, p2/M, z19.s, z0.s\n" + "smin z23.s, p2/M, z23.s, z0.s\n" + "smin z28.s, p2/M, z28.s, z0.s\n" + "smin z29.s, p2/M, z29.s, z0.s\n" + "smin z30.s, p2/M, z30.s, z0.s\n" + "smin z24.s, p2/M, z24.s, z0.s\n" + "smin z25.s, p2/M, z25.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z0.s\n" + "smin z27.s, p2/M, z27.s, z0.s\n" + "smax z31.s, p2/M, z31.s, z1.s\n" + "smax z12.s, p2/M, z12.s, z1.s\n" + "smax z13.s, p2/M, z13.s, z1.s\n" "uzp1 z31.h, z31.h, z12.h\n" - "smax z14.s, p2/M, z14.s, z5.s\n" - "smax z8.s, p2/M, z8.s, z5.s\n" - "uzp1 z12.h, z13.h, z14.h\n" - "uzp1 z31.b, z31.b, z12.b\n" - "smax z9.s, p2/M, z9.s, z5.s\n" - "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z14.s, p2/M, z14.s, z1.s\n" + "smax z8.s, p2/M, z8.s, z1.s\n" + "uzp1 z0.h, z13.h, z14.h\n" + "uzp1 z31.b, z31.b, z0.b\n" + "smax z9.s, p2/M, z9.s, z1.s\n" + "smax z10.s, p2/M, z10.s, z1.s\n" "uzp1 z8.h, z8.h, z9.h\n" "st1b { z31.b }, p1, [x11]\n" - "smax z11.s, p2/M, z11.s, z5.s\n" - "smax z15.s, p2/M, z15.s, z5.s\n" - "uzp1 z9.h, z10.h, z11.h\n" - "uzp1 z8.b, z8.b, z9.b\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z1.s\n" + "smax z15.s, p2/M, z15.s, z1.s\n" + "uzp1 z31.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z31.b\n" + "smax z20.s, p2/M, z20.s, z1.s\n" + "smax z21.s, p2/M, z21.s, z1.s\n" "uzp1 z15.h, z15.h, z20.h\n" - "st1b { z8.b }, p1, [x24]\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "st1b { z8.b }, p1, [x26]\n" + "smax z22.s, p2/M, z22.s, z1.s\n" + "smax z16.s, p2/M, z16.s, z1.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z15.b, z15.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z1.s\n" + "smax z18.s, p2/M, z18.s, z1.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "st1b { z15.b }, p1, [x23]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" + "st1b { z15.b }, p1, [x25]\n" + "smax z19.s, p2/M, z19.s, z1.s\n" + "smax z23.s, p2/M, z23.s, z1.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z1.s\n" + "smax z29.s, p2/M, z29.s, z1.s\n" "uzp1 z23.h, z23.h, z28.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z30.s, p2/M, z30.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z28.h, z29.h, z30.h\n" - "uzp1 z23.b, z23.b, z28.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "st1b { z16.b }, p1, 
[x24]\n" + "smax z30.s, p2/M, z30.s, z1.s\n" + "smax z24.s, p2/M, z24.s, z1.s\n" + "uzp1 z16.h, z29.h, z30.h\n" + "uzp1 z23.b, z23.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z1.s\n" + "smax z26.s, p2/M, z26.s, z1.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z23.b }, p1, [x21]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x20]\n" + "st1b { z23.b }, p1, [x23]\n" + "smax z27.s, p2/M, z27.s, z1.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x22]\n" "addvl x11, x11, #1\n" "78:" // Height 6: Writeback done "decw x10, ALL, MUL #4\n" @@ -2420,7 +2420,6 @@ void sve_hybrid_s8qs_mmla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "80:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -2428,4 +2427,4 @@ void sve_hybrid_s8qs_mmla_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp index 28057aa961..cfa349f3aa 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -39,6 +39,7 @@ namespace arm_gemm { // Actual kernel implementations void sve_hybrid_s8s32_dot_6x4VL( ARGLIST ); +void sve_hybrid_s8s32_dot_6x4VL_a64fx( ARGLIST ); class cls_sve_hybrid_s8s32_dot_6x4VL { @@ -74,7 +75,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -83,10 +83,11 @@ public: return { 20.92 }; case CPUModel::V1: return { 62.24 }; + case CPUModel::A64FX: + return { 94.32 }; } } - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -95,6 +96,8 @@ public: return { 22.77, 3.90, 0.47 }; case CPUModel::V1: return { 48.09, 16.24, 0.83 }; + case CPUModel::A64FX: + return { 100.19, 3.13, 0.43 }; } } @@ -103,13 +106,19 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_s8s32_dot_6x4VL; - cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *) + cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_hybrid_s8s32_dot_6x4VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp index 51e9aa1b40..1a483210f3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp @@ -115,11 +115,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -135,12 +135,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "8:" // Height 1: Multiply loop: Main loop "sdot z8.s, z6.b, z0.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x26, x26, #0x4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z11.s, z7.b, z0.b\n" + "sdot z10.s, z17.b, z0.b\n" + "sdot z11.s, z16.b, z0.b\n" "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" @@ -150,12 +150,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "sdot z8.s, z6.b, z0.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL 
VL]\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z11.s, z7.b, z0.b\n" + "sdot z10.s, z17.b, z0.b\n" + "sdot z11.s, z16.b, z0.b\n" "addvl x10, x10, #4\n" "bne 5b\n" "st1w { z8.s }, p3, [x9]\n" @@ -183,15 +183,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 13f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x20]\n" + "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n" "b 14f\n" "13:" // Height 2: no accumulate "mov z8.s, #0x0\n" @@ -207,12 +207,12 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "15:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -220,7 +220,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "b 17f\n" "16:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "17:" // Height 2: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -231,18 +231,18 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "18:" // Height 2: Multiply loop: Main loop "sdot z8.s, z6.b, z0.b\n" "sdot z12.s, z6.b, z1.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" "add x26, x26, #0x4\n" "sdot z9.s, z7.b, z0.b\n" "sdot z13.s, z7.b, z1.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "subs x27, x27, #0x4\n" "add x25, x25, #0x4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z11.s, z7.b, z0.b\n" - "sdot z15.s, z7.b, z1.b\n" + "sdot z10.s, z17.b, z0.b\n" + "sdot z14.s, z17.b, z1.b\n" + "sdot z11.s, z16.b, z0.b\n" + "sdot z15.s, z16.b, z1.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1b { z6.b }, p4/Z, [x10]\n" @@ -252,29 +252,29 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "sdot z8.s, z6.b, z0.b\n" "sdot z12.s, z6.b, z1.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" "sdot z9.s, z7.b, z0.b\n" "sdot z13.s, z7.b, z1.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" + "sdot z10.s, z17.b, z0.b\n" + "sdot z14.s, z17.b, z1.b\n" "addvl x10, x10, #4\n" - "sdot z11.s, z7.b, z0.b\n" - 
"sdot z15.s, z7.b, z1.b\n" + "sdot z11.s, z16.b, z0.b\n" + "sdot z15.s, z16.b, z1.b\n" "bne 15b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x20]\n" + "st1w { z13.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x20, #3, MUL VL]\n" "20:" // Height 2: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -295,20 +295,20 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x21]\n" + "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x20]\n" + "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n" "b 24f\n" "23:" // Height 3: no accumulate "mov z8.s, #0x0\n" @@ -328,13 +328,13 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "25:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 26f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 27f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -343,8 +343,8 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "b 27f\n" "26:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "27:" // Height 3: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -360,21 +360,21 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "subs x27, x27, #0x4\n" "sdot z16.s, z6.b, z2.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "sdot z13.s, z7.b, z1.b\n" "sdot z17.s, z7.b, z2.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x24, x24, #0x4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, 
z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z11.s, z7.b, z0.b\n" + "sdot z10.s, z21.b, z0.b\n" + "sdot z14.s, z21.b, z1.b\n" + "sdot z18.s, z21.b, z2.b\n" + "sdot z11.s, z20.b, z0.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "sdot z15.s, z7.b, z1.b\n" - "sdot z19.s, z7.b, z2.b\n" + "sdot z15.s, z20.b, z1.b\n" + "sdot z19.s, z20.b, z2.b\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n" @@ -386,35 +386,35 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "add x28, x28, #0x1\n" "sdot z16.s, z6.b, z2.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "sdot z13.s, z7.b, z1.b\n" "sdot z17.s, z7.b, z2.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z11.s, z7.b, z0.b\n" - "sdot z15.s, z7.b, z1.b\n" - "sdot z19.s, z7.b, z2.b\n" + "sdot z10.s, z21.b, z0.b\n" + "sdot z14.s, z21.b, z1.b\n" + "sdot z18.s, z21.b, z2.b\n" + "sdot z11.s, z20.b, z0.b\n" + "sdot z15.s, z20.b, z1.b\n" + "sdot z19.s, z20.b, z2.b\n" "bne 25b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x21]\n" + "st1w { z13.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x20]\n" + "st1w { z17.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x20, #3, MUL VL]\n" "30:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -435,25 +435,25 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x22]\n" - "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x22]\n" + "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w 
{ z16.s }, p3/Z, [x21]\n" + "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x20]\n" + "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n" "b 34f\n" "33:" // Height 4: no accumulate "mov z8.s, #0x0\n" @@ -477,14 +477,14 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "35:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -494,9 +494,9 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "b 37f\n" "36:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "37:" // Height 4: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -513,7 +513,7 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "subs x27, x27, #0x4\n" "sdot z16.s, z6.b, z2.b\n" "sdot z20.s, z6.b, z3.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "sdot z9.s, z7.b, z0.b\n" "sdot z13.s, z7.b, z1.b\n" @@ -521,19 +521,19 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "add x23, x23, #0x4\n" "sdot z17.s, z7.b, z2.b\n" "sdot z21.s, z7.b, z3.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z22.s, z6.b, z3.b\n" + "sdot z10.s, z25.b, z0.b\n" + "sdot z14.s, z25.b, z1.b\n" + "sdot z18.s, z25.b, z2.b\n" + "sdot z22.s, z25.b, z3.b\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "sdot z11.s, z7.b, z0.b\n" - "sdot z15.s, z7.b, z1.b\n" + "sdot z11.s, z24.b, z0.b\n" + "sdot z15.s, z24.b, z1.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" - "sdot z19.s, z7.b, z2.b\n" - "sdot z23.s, z7.b, z3.b\n" + "sdot z19.s, z24.b, z2.b\n" + "sdot z23.s, z24.b, z3.b\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n" @@ -545,44 +545,44 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "add x28, x28, #0x1\n" "sdot z16.s, z6.b, z2.b\n" "sdot z20.s, z6.b, z3.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "sdot z9.s, z7.b, z0.b\n" "sdot z13.s, z7.b, z1.b\n" "sdot z17.s, z7.b, z2.b\n" "sdot z21.s, z7.b, z3.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z22.s, z6.b, z3.b\n" - "sdot z11.s, z7.b, z0.b\n" - "sdot z15.s, z7.b, z1.b\n" - "sdot z19.s, z7.b, z2.b\n" - "sdot z23.s, z7.b, z3.b\n" + "sdot z10.s, z25.b, z0.b\n" + "sdot z14.s, z25.b, 
z1.b\n" + "sdot z18.s, z25.b, z2.b\n" + "sdot z22.s, z25.b, z3.b\n" + "sdot z11.s, z24.b, z0.b\n" + "sdot z15.s, z24.b, z1.b\n" + "sdot z19.s, z24.b, z2.b\n" + "sdot z23.s, z24.b, z3.b\n" "bne 35b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p3, [x22]\n" - "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x22]\n" + "st1w { z13.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x21]\n" + "st1w { z17.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x20]\n" + "st1w { z21.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x20, #3, MUL VL]\n" "40:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -603,30 +603,30 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x22]\n" - "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x21]\n" - "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL 
VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" "b 44f\n" "43:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -654,15 +654,15 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "45:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -673,10 +673,10 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "b 47f\n" "46:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "47:" // Height 5: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -698,29 +698,29 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "add x24, x24, #0x4\n" "sdot z24.s, z6.b, z4.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n" "add x23, x23, #0x4\n" "sdot z13.s, z7.b, z1.b\n" "sdot z17.s, z7.b, z2.b\n" "add x22, x22, #0x4\n" "sdot z21.s, z7.b, z3.b\n" "sdot z25.s, z7.b, z4.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z22.s, z6.b, z3.b\n" - "sdot z26.s, z6.b, z4.b\n" - "sdot z11.s, z7.b, z0.b\n" + "sdot z10.s, z29.b, z0.b\n" + "sdot z14.s, z29.b, z1.b\n" + "sdot z18.s, z29.b, z2.b\n" + "sdot z22.s, z29.b, z3.b\n" + "sdot z26.s, z29.b, z4.b\n" + "sdot z11.s, z28.b, z0.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "sdot z15.s, z7.b, z1.b\n" - "sdot z19.s, z7.b, z2.b\n" + "sdot z15.s, z28.b, z1.b\n" + "sdot z19.s, z28.b, z2.b\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" - "sdot z23.s, z7.b, z3.b\n" - "sdot z27.s, z7.b, z4.b\n" + "sdot z23.s, z28.b, z3.b\n" + "sdot z27.s, z28.b, z4.b\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1rw { z4.s }, p4/Z, [x22]\n" "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n" @@ -735,50 +735,50 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "cmp x28, x20\n" "sdot z24.s, z6.b, z4.b\n" "sdot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n" "sdot z13.s, z7.b, z1.b\n" "sdot z17.s, z7.b, z2.b\n" "sdot z21.s, z7.b, z3.b\n" "sdot z25.s, z7.b, z4.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b\n" - "sdot z14.s, z6.b, z1.b\n" - "sdot z18.s, z6.b, z2.b\n" - "sdot z22.s, z6.b, z3.b\n" - "sdot z26.s, z6.b, z4.b\n" - "sdot z11.s, z7.b, z0.b\n" - "sdot z15.s, z7.b, z1.b\n" - "sdot z19.s, z7.b, z2.b\n" - "sdot z23.s, z7.b, z3.b\n" - "sdot z27.s, z7.b, z4.b\n" + "sdot z10.s, z29.b, z0.b\n" + "sdot z14.s, z29.b, z1.b\n" + "sdot z18.s, z29.b, z2.b\n" + "sdot z22.s, 
z29.b, z3.b\n" + "sdot z26.s, z29.b, z4.b\n" + "sdot z11.s, z28.b, z0.b\n" + "sdot z15.s, z28.b, z1.b\n" + "sdot z19.s, z28.b, z2.b\n" + "sdot z23.s, z28.b, z3.b\n" + "sdot z27.s, z28.b, z4.b\n" "bne 45b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "st1w { z8.s }, p3, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "st1w { z8.s }, p3, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p3, [x22]\n" - "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p3, [x21]\n" - "st1w { z25.s }, p2, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p1, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" "50:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -862,16 +862,16 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "55:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 56f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 57f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -883,11 +883,11 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "b 57f\n" "56:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "57:" // Height 6: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -1022,7 +1022,6 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "62:" // Exit - : 
[M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1030,4 +1029,4 @@ void sve_hybrid_s8s32_dot_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp index b3d2e6b271..eeef192b66 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp @@ -115,11 +115,11 @@ void sve_hybrid_s8s32_dot_6x4VL ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -132,87 +132,87 @@ void sve_hybrid_s8s32_dot_6x4VL ( "8:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10]\n" + "sdot z8.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z9.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z10.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z11.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n" + "sdot z8.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z9.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z10.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - 
"sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z11.s, z16.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[2]\n" + "sdot z9.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" "add x26, x26, #0x10\n" "bgt 8b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10]\n" + "sdot z8.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z9.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[0]\n" + "sdot z11.s, z16.b, z0.b[0]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[1]\n" + "sdot z9.s, z16.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z10.s, z17.b, z0.b[1]\n" + "sdot z11.s, z16.b, z0.b[1]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[2]\n" + "sdot z9.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - 
"sdot z10.s, z6.b, z0.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" "addvl x10, x10, #4\n" "10:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -244,15 +244,15 @@ void sve_hybrid_s8s32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 14f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" "b 15f\n" "14:" // Height 2: no accumulate "mov z8.s, #0x0\n" @@ -268,12 +268,12 @@ void sve_hybrid_s8s32_dot_6x4VL ( "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -281,146 +281,146 @@ void sve_hybrid_s8s32_dot_6x4VL ( "b 18f\n" "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "18:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 20f\n" "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[0]\n" + "sdot z12.s, z17.b, z0.b[0]\n" + "sdot z9.s, z16.b, z1.b[0]\n" + "sdot z13.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[0]\n" + "sdot z14.s, z17.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" "cmp x27, #0x10\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[0]\n" + "sdot z15.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" "add x26, x26, #0x10\n" - "sdot z8.s, z6.b, 
z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[1]\n" + "sdot z12.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" "add x25, x25, #0x10\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[1]\n" + "sdot z13.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z10.s, z17.b, z1.b[1]\n" + "sdot z14.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[1]\n" + "sdot z15.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[2]\n" + "sdot z12.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[2]\n" + "sdot z13.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[2]\n" + "sdot z14.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z16.b, z1.b[2]\n" + "sdot z15.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z17.b, z1.b[3]\n" + "sdot z12.s, z17.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z16.b, z1.b[3]\n" + "sdot z13.s, z16.b, z0.b[3]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z17.b, z1.b[3]\n" + "sdot z14.s, z17.b, z0.b[3]\n" + "sdot z11.s, z16.b, z1.b[3]\n" + "sdot z15.s, z16.b, z0.b[3]\n" "bgt 19b\n" "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[0]\n" + "sdot z12.s, z17.b, z1.b[0]\n" + "sdot z9.s, z16.b, z0.b[0]\n" + "sdot z13.s, z16.b, z1.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[0]\n" + "sdot z14.s, z17.b, z1.b[0]\n" "addvl x10, x10, #4\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z11.s, z16.b, z0.b[0]\n" + "sdot z15.s, z16.b, 
z1.b[0]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[1]\n" + "sdot z12.s, z17.b, z1.b[1]\n" + "sdot z9.s, z16.b, z0.b[1]\n" + "sdot z13.s, z16.b, z1.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z10.s, z17.b, z0.b[1]\n" + "sdot z14.s, z17.b, z1.b[1]\n" "addvl x10, x10, #4\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z11.s, z16.b, z0.b[1]\n" + "sdot z15.s, z16.b, z1.b[1]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[2]\n" + "sdot z12.s, z17.b, z1.b[2]\n" + "sdot z9.s, z16.b, z0.b[2]\n" + "sdot z13.s, z16.b, z1.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z10.s, z17.b, z0.b[2]\n" + "sdot z14.s, z17.b, z1.b[2]\n" "addvl x10, x10, #4\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z11.s, z16.b, z0.b[2]\n" + "sdot z15.s, z16.b, z1.b[2]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z17.b, z0.b[3]\n" + "sdot z12.s, z17.b, z1.b[3]\n" + "sdot z9.s, z16.b, z0.b[3]\n" + "sdot z13.s, z16.b, z1.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z17.b, z0.b[3]\n" + "sdot z14.s, z17.b, z1.b[3]\n" "addvl x10, x10, #4\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z11.s, z16.b, z0.b[3]\n" + "sdot z15.s, z16.b, z1.b[3]\n" "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 16b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x20]\n" + "st1w { z13.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x20, #3, MUL VL]\n" "22:" // Height 2: Writeback done "decw x11, ALL, MUL 
#4\n" "cmp x11, XZR\n" @@ -441,20 +441,20 @@ void sve_hybrid_s8s32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20]\n" + "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n" "b 26f\n" "25:" // Height 3: no accumulate "mov z8.s, #0x0\n" @@ -474,13 +474,13 @@ void sve_hybrid_s8s32_dot_6x4VL ( "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 28f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 29f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -489,86 +489,86 @@ void sve_hybrid_s8s32_dot_6x4VL ( "b 29f\n" "28:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "29:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 31f\n" "30:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x24]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "sdot z8.s, z21.b, z2.b[0]\n" + "sdot z12.s, z21.b, z1.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z16.s, z21.b, z0.b[0]\n" + "sdot z9.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[0]\n" + "sdot z17.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "cmp x27, #0x10\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z10.s, z21.b, z2.b[0]\n" + "sdot z14.s, z21.b, z1.b[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL 
VL]\n" + "sdot z18.s, z21.b, z0.b[0]\n" + "sdot z11.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[0]\n" + "sdot z19.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[1]\n" + "sdot z12.s, z21.b, z1.b[1]\n" + "sdot z16.s, z21.b, z0.b[1]\n" + "sdot z9.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[1]\n" + "sdot z17.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z10.s, z21.b, z2.b[1]\n" + "sdot z14.s, z21.b, z1.b[1]\n" + "sdot z18.s, z21.b, z0.b[1]\n" + "sdot z11.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[1]\n" + "sdot z19.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[2]\n" + "sdot z12.s, z21.b, z1.b[2]\n" + "sdot z16.s, z21.b, z0.b[2]\n" + "sdot z9.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[2]\n" + "sdot z17.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z21.b, z2.b[2]\n" + "sdot z14.s, z21.b, z1.b[2]\n" + "sdot z18.s, z21.b, z0.b[2]\n" + "sdot z11.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n" + "sdot z15.s, z20.b, z1.b[2]\n" + "sdot z19.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z21.b, z2.b[3]\n" + "sdot z12.s, z21.b, z1.b[3]\n" + "sdot z16.s, z21.b, z0.b[3]\n" + "sdot z9.s, z20.b, z2.b[3]\n" + "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[3]\n" + "sdot z17.s, z20.b, z0.b[3]\n" + "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z21.b, z2.b[3]\n" + "sdot z14.s, z21.b, z1.b[3]\n" + "sdot z18.s, z21.b, z0.b[3]\n" + "sdot 
z11.s, z20.b, z2.b[3]\n" + "sdot z15.s, z20.b, z1.b[3]\n" + "sdot z19.s, z20.b, z0.b[3]\n" "bgt 30b\n" "31:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -576,100 +576,100 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "sdot z8.s, z21.b, z0.b[0]\n" + "sdot z12.s, z21.b, z1.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z16.s, z21.b, z2.b[0]\n" + "sdot z9.s, z20.b, z0.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[0]\n" + "sdot z17.s, z20.b, z2.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z10.s, z21.b, z0.b[0]\n" + "sdot z14.s, z21.b, z1.b[0]\n" + "sdot z18.s, z21.b, z2.b[0]\n" + "sdot z11.s, z20.b, z0.b[0]\n" + "sdot z15.s, z20.b, z1.b[0]\n" + "sdot z19.s, z20.b, z2.b[0]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[1]\n" + "sdot z12.s, z21.b, z1.b[1]\n" + "sdot z16.s, z21.b, z2.b[1]\n" + "sdot z9.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[1]\n" + "sdot z17.s, z20.b, z2.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z10.s, z21.b, z0.b[1]\n" + "sdot z14.s, z21.b, z1.b[1]\n" + "sdot z18.s, z21.b, z2.b[1]\n" + "sdot z11.s, z20.b, z0.b[1]\n" + "sdot z15.s, z20.b, z1.b[1]\n" + "sdot z19.s, z20.b, z2.b[1]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[2]\n" + "sdot z12.s, z21.b, z1.b[2]\n" + "sdot z16.s, z21.b, z2.b[2]\n" + "sdot z9.s, z20.b, z0.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[2]\n" + "sdot z17.s, z20.b, z2.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z11.s, z7.b, 
z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z10.s, z21.b, z0.b[2]\n" + "sdot z14.s, z21.b, z1.b[2]\n" + "sdot z18.s, z21.b, z2.b[2]\n" + "sdot z11.s, z20.b, z0.b[2]\n" + "sdot z15.s, z20.b, z1.b[2]\n" + "sdot z19.s, z20.b, z2.b[2]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z21.b, z0.b[3]\n" + "sdot z12.s, z21.b, z1.b[3]\n" + "sdot z16.s, z21.b, z2.b[3]\n" + "sdot z9.s, z20.b, z0.b[3]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z20.b, z1.b[3]\n" + "sdot z17.s, z20.b, z2.b[3]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z10.s, z21.b, z0.b[3]\n" + "sdot z14.s, z21.b, z1.b[3]\n" + "sdot z18.s, z21.b, z2.b[3]\n" + "sdot z11.s, z20.b, z0.b[3]\n" + "sdot z15.s, z20.b, z1.b[3]\n" + "sdot z19.s, z20.b, z2.b[3]\n" "32:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 27b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x21]\n" + "st1w { z13.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x20, #3, MUL VL]\n" "33:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -690,25 +690,25 @@ void sve_hybrid_s8s32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" - 
"ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21]\n" + "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "b 37f\n" "36:" // Height 4: no accumulate "mov z8.s, #0x0\n" @@ -732,14 +732,14 @@ void sve_hybrid_s8s32_dot_6x4VL ( "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 39f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -749,105 +749,105 @@ void sve_hybrid_s8s32_dot_6x4VL ( "b 40f\n" "39:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "40:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 42f\n" "41:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z3.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[0]\n" + "sdot z12.s, z25.b, z2.b[0]\n" + "sdot z16.s, z25.b, z1.b[0]\n" + "sdot z20.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x10\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z9.s, z24.b, z3.b[0]\n" + "sdot z13.s, z24.b, z2.b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, 
z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "sdot z17.s, z24.b, z1.b[0]\n" + "sdot z21.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[0]\n" + "sdot z14.s, z25.b, z2.b[0]\n" + "sdot z18.s, z25.b, z1.b[0]\n" + "sdot z22.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[0]\n" + "sdot z15.s, z24.b, z2.b[0]\n" + "sdot z19.s, z24.b, z1.b[0]\n" + "sdot z23.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[1]\n" + "sdot z12.s, z25.b, z2.b[1]\n" + "sdot z16.s, z25.b, z1.b[1]\n" + "sdot z20.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[1]\n" + "sdot z13.s, z24.b, z2.b[1]\n" + "sdot z17.s, z24.b, z1.b[1]\n" + "sdot z21.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z10.s, z25.b, z3.b[1]\n" + "sdot z14.s, z25.b, z2.b[1]\n" + "sdot z18.s, z25.b, z1.b[1]\n" + "sdot z22.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[1]\n" + "sdot z15.s, z24.b, z2.b[1]\n" + "sdot z19.s, z24.b, z1.b[1]\n" + "sdot z23.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[2]\n" + "sdot z12.s, z25.b, z2.b[2]\n" + "sdot z16.s, z25.b, z1.b[2]\n" + "sdot z20.s, z25.b, z0.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[2]\n" + "sdot z13.s, z24.b, z2.b[2]\n" + "sdot z17.s, z24.b, z1.b[2]\n" + "sdot z21.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[2]\n" + "sdot z14.s, z25.b, z2.b[2]\n" + "sdot z18.s, z25.b, z1.b[2]\n" + "sdot z22.s, z25.b, 
z0.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z24.b, z3.b[2]\n" + "sdot z15.s, z24.b, z2.b[2]\n" + "sdot z19.s, z24.b, z1.b[2]\n" + "sdot z23.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z25.b, z3.b[3]\n" + "sdot z12.s, z25.b, z2.b[3]\n" + "sdot z16.s, z25.b, z1.b[3]\n" + "sdot z20.s, z25.b, z0.b[3]\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z24.b, z3.b[3]\n" + "sdot z13.s, z24.b, z2.b[3]\n" + "sdot z17.s, z24.b, z1.b[3]\n" + "sdot z21.s, z24.b, z0.b[3]\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z25.b, z3.b[3]\n" + "sdot z14.s, z25.b, z2.b[3]\n" + "sdot z18.s, z25.b, z1.b[3]\n" + "sdot z22.s, z25.b, z0.b[3]\n" + "sdot z11.s, z24.b, z3.b[3]\n" + "sdot z15.s, z24.b, z2.b[3]\n" + "sdot z19.s, z24.b, z1.b[3]\n" + "sdot z23.s, z24.b, z0.b[3]\n" "bgt 41b\n" "42:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -856,121 +856,121 @@ void sve_hybrid_s8s32_dot_6x4VL ( "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[0]\n" + "sdot z12.s, z25.b, z1.b[0]\n" + "sdot z16.s, z25.b, z2.b[0]\n" + "sdot z20.s, z25.b, z3.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[0]\n" + "sdot z13.s, z24.b, z1.b[0]\n" + "sdot z17.s, z24.b, z2.b[0]\n" + "sdot z21.s, z24.b, z3.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z10.s, z25.b, z0.b[0]\n" + "sdot z14.s, z25.b, z1.b[0]\n" + "sdot z18.s, z25.b, z2.b[0]\n" + "sdot z22.s, z25.b, z3.b[0]\n" + "sdot z11.s, z24.b, z0.b[0]\n" + "sdot z15.s, z24.b, z1.b[0]\n" + "sdot z19.s, z24.b, z2.b[0]\n" + "sdot z23.s, z24.b, z3.b[0]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[1]\n" + "sdot z12.s, z25.b, z1.b[1]\n" + "sdot z16.s, z25.b, z2.b[1]\n" + "sdot z20.s, z25.b, z3.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[1]\n" + "sdot z13.s, z24.b, z1.b[1]\n" + "sdot z17.s, z24.b, z2.b[1]\n" + "sdot z21.s, z24.b, z3.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot 
z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z10.s, z25.b, z0.b[1]\n" + "sdot z14.s, z25.b, z1.b[1]\n" + "sdot z18.s, z25.b, z2.b[1]\n" + "sdot z22.s, z25.b, z3.b[1]\n" + "sdot z11.s, z24.b, z0.b[1]\n" + "sdot z15.s, z24.b, z1.b[1]\n" + "sdot z19.s, z24.b, z2.b[1]\n" + "sdot z23.s, z24.b, z3.b[1]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[2]\n" + "sdot z12.s, z25.b, z1.b[2]\n" + "sdot z16.s, z25.b, z2.b[2]\n" + "sdot z20.s, z25.b, z3.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[2]\n" + "sdot z13.s, z24.b, z1.b[2]\n" + "sdot z17.s, z24.b, z2.b[2]\n" + "sdot z21.s, z24.b, z3.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z10.s, z25.b, z0.b[2]\n" + "sdot z14.s, z25.b, z1.b[2]\n" + "sdot z18.s, z25.b, z2.b[2]\n" + "sdot z22.s, z25.b, z3.b[2]\n" + "sdot z11.s, z24.b, z0.b[2]\n" + "sdot z15.s, z24.b, z1.b[2]\n" + "sdot z19.s, z24.b, z2.b[2]\n" + "sdot z23.s, z24.b, z3.b[2]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z25.b, z0.b[3]\n" + "sdot z12.s, z25.b, z1.b[3]\n" + "sdot z16.s, z25.b, z2.b[3]\n" + "sdot z20.s, z25.b, z3.b[3]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z24.b, z0.b[3]\n" + "sdot z13.s, z24.b, z1.b[3]\n" + "sdot z17.s, z24.b, z2.b[3]\n" + "sdot z21.s, z24.b, z3.b[3]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z10.s, z25.b, z0.b[3]\n" + "sdot z14.s, z25.b, z1.b[3]\n" + "sdot z18.s, z25.b, z2.b[3]\n" + "sdot z22.s, z25.b, z3.b[3]\n" + "sdot z11.s, z24.b, z0.b[3]\n" + "sdot z15.s, z24.b, z1.b[3]\n" + "sdot z19.s, z24.b, z2.b[3]\n" + "sdot z23.s, z24.b, z3.b[3]\n" "43:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 38b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, 
LSL #2\n" "st1w { z8.s }, p4, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p4, [x22]\n" - "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x22]\n" + "st1w { z13.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x20]\n" + "st1w { z21.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x20, #3, MUL VL]\n" "44:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -991,30 +991,30 @@ void sve_hybrid_s8s32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x21]\n" - "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" "b 48f\n" "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -1042,15 +1042,15 @@ void sve_hybrid_s8s32_dot_6x4VL ( "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1061,124 +1061,124 @@ void sve_hybrid_s8s32_dot_6x4VL ( "b 51f\n" "50:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "51:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 53f\n" "52:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x26]\n" + "ld1rqb { z3.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" + "ld1rqb { z0.b }, p0/Z, [x22]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "sdot z8.s, z29.b, z4.b[0]\n" + "sdot z12.s, z29.b, z3.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z16.s, z29.b, z2.b[0]\n" + "sdot z20.s, z29.b, z1.b[0]\n" "add x25, x25, #0x10\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z24.s, z29.b, z0.b[0]\n" + "sdot z9.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" "add x24, x24, #0x10\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z13.s, z28.b, z3.b[0]\n" + "sdot z17.s, z28.b, z2.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "sdot z21.s, z28.b, z1.b[0]\n" + "sdot z25.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[0]\n" + "sdot z14.s, z29.b, z3.b[0]\n" + "sdot z18.s, z29.b, z2.b[0]\n" + "sdot z22.s, z29.b, z1.b[0]\n" + "sdot z26.s, z29.b, z0.b[0]\n" + "sdot z11.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, 
p5/Z, [x10, #4, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[0]\n" + "sdot z19.s, z28.b, z2.b[0]\n" + "sdot z23.s, z28.b, z1.b[0]\n" + "sdot z27.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[1]\n" + "sdot z12.s, z29.b, z3.b[1]\n" + "sdot z16.s, z29.b, z2.b[1]\n" + "sdot z20.s, z29.b, z1.b[1]\n" + "sdot z24.s, z29.b, z0.b[1]\n" + "sdot z9.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[1]\n" + "sdot z17.s, z28.b, z2.b[1]\n" + "sdot z21.s, z28.b, z1.b[1]\n" + "sdot z25.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z10.s, z29.b, z4.b[1]\n" + "sdot z14.s, z29.b, z3.b[1]\n" + "sdot z18.s, z29.b, z2.b[1]\n" + "sdot z22.s, z29.b, z1.b[1]\n" + "sdot z26.s, z29.b, z0.b[1]\n" + "sdot z11.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[1]\n" + "sdot z19.s, z28.b, z2.b[1]\n" + "sdot z23.s, z28.b, z1.b[1]\n" + "sdot z27.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[2]\n" + "sdot z12.s, z29.b, z3.b[2]\n" + "sdot z16.s, z29.b, z2.b[2]\n" + "sdot z20.s, z29.b, z1.b[2]\n" + "sdot z24.s, z29.b, z0.b[2]\n" + "sdot z9.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[2]\n" + "sdot z17.s, z28.b, z2.b[2]\n" + "sdot z21.s, z28.b, z1.b[2]\n" + "sdot z25.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[2]\n" + "sdot z14.s, z29.b, z3.b[2]\n" + "sdot z18.s, z29.b, 
z2.b[2]\n" + "sdot z22.s, z29.b, z1.b[2]\n" + "sdot z26.s, z29.b, z0.b[2]\n" + "sdot z11.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n" + "sdot z15.s, z28.b, z3.b[2]\n" + "sdot z19.s, z28.b, z2.b[2]\n" + "sdot z23.s, z28.b, z1.b[2]\n" + "sdot z27.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z29.b, z4.b[3]\n" + "sdot z12.s, z29.b, z3.b[3]\n" + "sdot z16.s, z29.b, z2.b[3]\n" + "sdot z20.s, z29.b, z1.b[3]\n" + "sdot z24.s, z29.b, z0.b[3]\n" + "sdot z9.s, z28.b, z4.b[3]\n" + "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n" + "sdot z13.s, z28.b, z3.b[3]\n" + "sdot z17.s, z28.b, z2.b[3]\n" + "sdot z21.s, z28.b, z1.b[3]\n" + "sdot z25.s, z28.b, z0.b[3]\n" + "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z29.b, z4.b[3]\n" + "sdot z14.s, z29.b, z3.b[3]\n" + "sdot z18.s, z29.b, z2.b[3]\n" + "sdot z22.s, z29.b, z1.b[3]\n" + "sdot z26.s, z29.b, z0.b[3]\n" + "sdot z11.s, z28.b, z4.b[3]\n" + "sdot z15.s, z28.b, z3.b[3]\n" + "sdot z19.s, z28.b, z2.b[3]\n" + "sdot z23.s, z28.b, z1.b[3]\n" + "sdot z27.s, z28.b, z0.b[3]\n" "bgt 52b\n" "53:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1188,142 +1188,142 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "sdot z8.s, z29.b, z0.b[0]\n" + "sdot z12.s, z29.b, z1.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z16.s, z29.b, z2.b[0]\n" + "sdot z20.s, z29.b, z3.b[0]\n" + "sdot z24.s, z29.b, z4.b[0]\n" + "sdot z9.s, z28.b, z0.b[0]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[0]\n" + "sdot z17.s, z28.b, z2.b[0]\n" + "sdot z21.s, z28.b, z3.b[0]\n" + "sdot z25.s, z28.b, z4.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z10.s, z29.b, z0.b[0]\n" + "sdot z14.s, z29.b, z1.b[0]\n" + "sdot z18.s, z29.b, z2.b[0]\n" + "sdot z22.s, z29.b, z3.b[0]\n" + "sdot z26.s, z29.b, z4.b[0]\n" + "sdot z11.s, z28.b, z0.b[0]\n" + "sdot z15.s, z28.b, z1.b[0]\n" + "sdot z19.s, z28.b, z2.b[0]\n" + "sdot z23.s, z28.b, z3.b[0]\n" + "sdot z27.s, z28.b, z4.b[0]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[1]\n" + "sdot z12.s, z29.b, z1.b[1]\n" + "sdot z16.s, z29.b, z2.b[1]\n" + "sdot z20.s, z29.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL 
VL]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z24.s, z29.b, z4.b[1]\n" + "sdot z9.s, z28.b, z0.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[1]\n" + "sdot z17.s, z28.b, z2.b[1]\n" + "sdot z21.s, z28.b, z3.b[1]\n" + "sdot z25.s, z28.b, z4.b[1]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z10.s, z29.b, z0.b[1]\n" + "sdot z14.s, z29.b, z1.b[1]\n" + "sdot z18.s, z29.b, z2.b[1]\n" + "sdot z22.s, z29.b, z3.b[1]\n" + "sdot z26.s, z29.b, z4.b[1]\n" + "sdot z11.s, z28.b, z0.b[1]\n" + "sdot z15.s, z28.b, z1.b[1]\n" + "sdot z19.s, z28.b, z2.b[1]\n" + "sdot z23.s, z28.b, z3.b[1]\n" + "sdot z27.s, z28.b, z4.b[1]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[2]\n" + "sdot z12.s, z29.b, z1.b[2]\n" + "sdot z16.s, z29.b, z2.b[2]\n" + "sdot z20.s, z29.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z24.s, z29.b, z4.b[2]\n" + "sdot z9.s, z28.b, z0.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[2]\n" + "sdot z17.s, z28.b, z2.b[2]\n" + "sdot z21.s, z28.b, z3.b[2]\n" + "sdot z25.s, z28.b, z4.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z10.s, z29.b, z0.b[2]\n" + "sdot z14.s, z29.b, z1.b[2]\n" + "sdot z18.s, z29.b, z2.b[2]\n" + "sdot z22.s, z29.b, z3.b[2]\n" + "sdot z26.s, z29.b, z4.b[2]\n" + "sdot z11.s, z28.b, z0.b[2]\n" + "sdot z15.s, z28.b, z1.b[2]\n" + "sdot z19.s, z28.b, z2.b[2]\n" + "sdot z23.s, z28.b, z3.b[2]\n" + "sdot z27.s, z28.b, z4.b[2]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z29.b, z0.b[3]\n" + "sdot z12.s, z29.b, z1.b[3]\n" + "sdot z16.s, z29.b, z2.b[3]\n" + "sdot z20.s, z29.b, z3.b[3]\n" + "sdot 
z24.s, z29.b, z4.b[3]\n" + "sdot z9.s, z28.b, z0.b[3]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z13.s, z28.b, z1.b[3]\n" + "sdot z17.s, z28.b, z2.b[3]\n" + "sdot z21.s, z28.b, z3.b[3]\n" + "sdot z25.s, z28.b, z4.b[3]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z10.s, z29.b, z0.b[3]\n" + "sdot z14.s, z29.b, z1.b[3]\n" + "sdot z18.s, z29.b, z2.b[3]\n" + "sdot z22.s, z29.b, z3.b[3]\n" + "sdot z26.s, z29.b, z4.b[3]\n" + "sdot z11.s, z28.b, z0.b[3]\n" + "sdot z15.s, z28.b, z1.b[3]\n" + "sdot z19.s, z28.b, z2.b[3]\n" + "sdot z23.s, z28.b, z3.b[3]\n" + "sdot z27.s, z28.b, z4.b[3]\n" "54:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 49b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "st1w { z8.s }, p4, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "st1w { z8.s }, p4, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p4, [x22]\n" - "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p4, [x21]\n" - "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" "55:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -1407,16 +1407,16 @@ void sve_hybrid_s8s32_dot_6x4VL ( "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add 
x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1428,143 +1428,143 @@ void sve_hybrid_s8s32_dot_6x4VL ( "b 62f\n" "61:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "62:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 64f\n" "63:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z6.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[0]\n" + "sdot z12.s, z1.b, z6.b[0]\n" + "sdot z16.s, z1.b, z5.b[0]\n" + "sdot z20.s, z1.b, z4.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z24.s, z1.b, z3.b[0]\n" + "sdot z28.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "add x21, x21, #0x10\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "sdot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[0]\n" + "sdot z13.s, z0.b, z6.b[0]\n" + "sdot z17.s, z0.b, z5.b[0]\n" + "sdot z21.s, z0.b, z4.b[0]\n" + "sdot z25.s, z0.b, z3.b[0]\n" + "sdot z29.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[0]\n" + "sdot z14.s, z1.b, z6.b[0]\n" + "sdot z18.s, z1.b, 
z5.b[0]\n" + "sdot z22.s, z1.b, z4.b[0]\n" + "sdot z26.s, z1.b, z3.b[0]\n" + "sdot z30.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[0]\n" + "sdot z15.s, z0.b, z6.b[0]\n" + "sdot z19.s, z0.b, z5.b[0]\n" + "sdot z23.s, z0.b, z4.b[0]\n" + "sdot z27.s, z0.b, z3.b[0]\n" + "sdot z31.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[1]\n" + "sdot z12.s, z1.b, z6.b[1]\n" + "sdot z16.s, z1.b, z5.b[1]\n" + "sdot z20.s, z1.b, z4.b[1]\n" + "sdot z24.s, z1.b, z3.b[1]\n" + "sdot z28.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[1]\n" + "sdot z13.s, z0.b, z6.b[1]\n" + "sdot z17.s, z0.b, z5.b[1]\n" + "sdot z21.s, z0.b, z4.b[1]\n" + "sdot z25.s, z0.b, z3.b[1]\n" + "sdot z29.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "sdot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "sdot z31.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z30.s, z6.b, z5.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" - "sdot z31.s, z7.b, z5.b[3]\n" + "sdot z10.s, z1.b, z7.b[1]\n" + "sdot z14.s, z1.b, z6.b[1]\n" + "sdot z18.s, z1.b, z5.b[1]\n" + "sdot z22.s, z1.b, z4.b[1]\n" + "sdot z26.s, z1.b, z3.b[1]\n" + "sdot z30.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[1]\n" + "sdot z15.s, z0.b, z6.b[1]\n" + "sdot z19.s, z0.b, z5.b[1]\n" + "sdot z23.s, z0.b, z4.b[1]\n" + "sdot z27.s, 
z0.b, z3.b[1]\n" + "sdot z31.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[2]\n" + "sdot z12.s, z1.b, z6.b[2]\n" + "sdot z16.s, z1.b, z5.b[2]\n" + "sdot z20.s, z1.b, z4.b[2]\n" + "sdot z24.s, z1.b, z3.b[2]\n" + "sdot z28.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[2]\n" + "sdot z13.s, z0.b, z6.b[2]\n" + "sdot z17.s, z0.b, z5.b[2]\n" + "sdot z21.s, z0.b, z4.b[2]\n" + "sdot z25.s, z0.b, z3.b[2]\n" + "sdot z29.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[2]\n" + "sdot z14.s, z1.b, z6.b[2]\n" + "sdot z18.s, z1.b, z5.b[2]\n" + "sdot z22.s, z1.b, z4.b[2]\n" + "sdot z26.s, z1.b, z3.b[2]\n" + "sdot z30.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n" + "sdot z11.s, z0.b, z7.b[2]\n" + "sdot z15.s, z0.b, z6.b[2]\n" + "sdot z19.s, z0.b, z5.b[2]\n" + "sdot z23.s, z0.b, z4.b[2]\n" + "sdot z27.s, z0.b, z3.b[2]\n" + "sdot z31.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + "sdot z8.s, z1.b, z7.b[3]\n" + "sdot z12.s, z1.b, z6.b[3]\n" + "sdot z16.s, z1.b, z5.b[3]\n" + "sdot z20.s, z1.b, z4.b[3]\n" + "sdot z24.s, z1.b, z3.b[3]\n" + "sdot z28.s, z1.b, z2.b[3]\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + "sdot z9.s, z0.b, z7.b[3]\n" + "sdot z13.s, z0.b, z6.b[3]\n" + "sdot z17.s, z0.b, z5.b[3]\n" + "sdot z21.s, z0.b, z4.b[3]\n" + "sdot z25.s, z0.b, z3.b[3]\n" + "sdot z29.s, z0.b, z2.b[3]\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + "sdot z10.s, z1.b, z7.b[3]\n" + "sdot z14.s, z1.b, z6.b[3]\n" + "sdot z18.s, z1.b, z5.b[3]\n" + "sdot z22.s, z1.b, z4.b[3]\n" + "sdot z26.s, z1.b, z3.b[3]\n" + "sdot z30.s, z1.b, z2.b[3]\n" + "sdot z11.s, z0.b, z7.b[3]\n" + "sdot z15.s, z0.b, z6.b[3]\n" + "sdot z19.s, z0.b, z5.b[3]\n" + "sdot z23.s, z0.b, z4.b[3]\n" + "sdot z27.s, z0.b, z3.b[3]\n" + "sdot z31.s, z0.b, z2.b[3]\n" "bgt 63b\n" "64:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1575,127 +1575,127 @@ void sve_hybrid_s8s32_dot_6x4VL ( "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" "ld1rqb { z5.b }, p0/Z, [x21]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[0]\n" - "sdot z12.s, z6.b, z1.b[0]\n" - "sdot z16.s, z6.b, z2.b[0]\n" - "sdot z20.s, z6.b, z3.b[0]\n" - "sdot z24.s, z6.b, z4.b[0]\n" - "sdot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[0]\n" - "sdot z13.s, z7.b, z1.b[0]\n" - "sdot z17.s, z7.b, z2.b[0]\n" - "sdot z21.s, z7.b, z3.b[0]\n" - "sdot z25.s, z7.b, z4.b[0]\n" - "sdot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[0]\n" + "sdot z12.s, z7.b, z1.b[0]\n" + "sdot z16.s, z7.b, z2.b[0]\n" + "sdot z20.s, z7.b, z3.b[0]\n" + "sdot z24.s, z7.b, z4.b[0]\n" + "sdot z28.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[0]\n" + "sdot z13.s, z6.b, z1.b[0]\n" + "sdot z17.s, z6.b, z2.b[0]\n" + "sdot z21.s, z6.b, z3.b[0]\n" + "sdot z25.s, z6.b, z4.b[0]\n" + "sdot z29.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[0]\n" - "sdot z14.s, z6.b, z1.b[0]\n" - "sdot z18.s, z6.b, z2.b[0]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z26.s, z6.b, z4.b[0]\n" - "sdot z30.s, z6.b, z5.b[0]\n" - "sdot z11.s, z7.b, z0.b[0]\n" - "sdot z15.s, z7.b, z1.b[0]\n" - "sdot z19.s, 
z7.b, z2.b[0]\n" - "sdot z23.s, z7.b, z3.b[0]\n" - "sdot z27.s, z7.b, z4.b[0]\n" - "sdot z31.s, z7.b, z5.b[0]\n" + "sdot z10.s, z7.b, z0.b[0]\n" + "sdot z14.s, z7.b, z1.b[0]\n" + "sdot z18.s, z7.b, z2.b[0]\n" + "sdot z22.s, z7.b, z3.b[0]\n" + "sdot z26.s, z7.b, z4.b[0]\n" + "sdot z30.s, z7.b, z5.b[0]\n" + "sdot z11.s, z6.b, z0.b[0]\n" + "sdot z15.s, z6.b, z1.b[0]\n" + "sdot z19.s, z6.b, z2.b[0]\n" + "sdot z23.s, z6.b, z3.b[0]\n" + "sdot z27.s, z6.b, z4.b[0]\n" + "sdot z31.s, z6.b, z5.b[0]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[1]\n" - "sdot z12.s, z6.b, z1.b[1]\n" - "sdot z16.s, z6.b, z2.b[1]\n" - "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[1]\n" + "sdot z12.s, z7.b, z1.b[1]\n" + "sdot z16.s, z7.b, z2.b[1]\n" + "sdot z20.s, z7.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[1]\n" - "sdot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[1]\n" - "sdot z13.s, z7.b, z1.b[1]\n" - "sdot z17.s, z7.b, z2.b[1]\n" - "sdot z21.s, z7.b, z3.b[1]\n" - "sdot z25.s, z7.b, z4.b[1]\n" - "sdot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z24.s, z7.b, z4.b[1]\n" + "sdot z28.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[1]\n" + "sdot z13.s, z6.b, z1.b[1]\n" + "sdot z17.s, z6.b, z2.b[1]\n" + "sdot z21.s, z6.b, z3.b[1]\n" + "sdot z25.s, z6.b, z4.b[1]\n" + "sdot z29.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[1]\n" - "sdot z14.s, z6.b, z1.b[1]\n" - "sdot z18.s, z6.b, z2.b[1]\n" - "sdot z22.s, z6.b, z3.b[1]\n" - "sdot z26.s, z6.b, z4.b[1]\n" - "sdot z30.s, z6.b, z5.b[1]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z15.s, z7.b, z1.b[1]\n" - "sdot z19.s, z7.b, z2.b[1]\n" - "sdot z23.s, z7.b, z3.b[1]\n" - "sdot z27.s, z7.b, z4.b[1]\n" - "sdot z31.s, z7.b, z5.b[1]\n" + "sdot z10.s, z7.b, z0.b[1]\n" + "sdot z14.s, z7.b, z1.b[1]\n" + "sdot z18.s, z7.b, z2.b[1]\n" + "sdot z22.s, z7.b, z3.b[1]\n" + "sdot z26.s, z7.b, z4.b[1]\n" + "sdot z30.s, z7.b, z5.b[1]\n" + "sdot z11.s, z6.b, z0.b[1]\n" + "sdot z15.s, z6.b, z1.b[1]\n" + "sdot z19.s, z6.b, z2.b[1]\n" + "sdot z23.s, z6.b, z3.b[1]\n" + "sdot z27.s, z6.b, z4.b[1]\n" + "sdot z31.s, z6.b, z5.b[1]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[2]\n" - "sdot z12.s, z6.b, z1.b[2]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[2]\n" + "sdot z12.s, z7.b, z1.b[2]\n" + "sdot z16.s, z7.b, z2.b[2]\n" + "sdot z20.s, z7.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "sdot z24.s, z6.b, z4.b[2]\n" - "sdot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[2]\n" - "sdot z13.s, z7.b, z1.b[2]\n" - "sdot z17.s, z7.b, z2.b[2]\n" - "sdot z21.s, z7.b, z3.b[2]\n" - "sdot z25.s, z7.b, z4.b[2]\n" - "sdot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "sdot z24.s, z7.b, z4.b[2]\n" + "sdot z28.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[2]\n" + "sdot z13.s, z6.b, z1.b[2]\n" + "sdot z17.s, z6.b, z2.b[2]\n" + "sdot z21.s, z6.b, z3.b[2]\n" + "sdot z25.s, z6.b, z4.b[2]\n" + "sdot z29.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL 
VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[2]\n" - "sdot z14.s, z6.b, z1.b[2]\n" - "sdot z18.s, z6.b, z2.b[2]\n" - "sdot z22.s, z6.b, z3.b[2]\n" - "sdot z26.s, z6.b, z4.b[2]\n" - "sdot z30.s, z6.b, z5.b[2]\n" - "sdot z11.s, z7.b, z0.b[2]\n" - "sdot z15.s, z7.b, z1.b[2]\n" - "sdot z19.s, z7.b, z2.b[2]\n" - "sdot z23.s, z7.b, z3.b[2]\n" - "sdot z27.s, z7.b, z4.b[2]\n" - "sdot z31.s, z7.b, z5.b[2]\n" + "sdot z10.s, z7.b, z0.b[2]\n" + "sdot z14.s, z7.b, z1.b[2]\n" + "sdot z18.s, z7.b, z2.b[2]\n" + "sdot z22.s, z7.b, z3.b[2]\n" + "sdot z26.s, z7.b, z4.b[2]\n" + "sdot z30.s, z7.b, z5.b[2]\n" + "sdot z11.s, z6.b, z0.b[2]\n" + "sdot z15.s, z6.b, z1.b[2]\n" + "sdot z19.s, z6.b, z2.b[2]\n" + "sdot z23.s, z6.b, z3.b[2]\n" + "sdot z27.s, z6.b, z4.b[2]\n" + "sdot z31.s, z6.b, z5.b[2]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "sdot z8.s, z6.b, z0.b[3]\n" - "sdot z12.s, z6.b, z1.b[3]\n" - "sdot z16.s, z6.b, z2.b[3]\n" - "sdot z20.s, z6.b, z3.b[3]\n" - "sdot z24.s, z6.b, z4.b[3]\n" - "sdot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "sdot z9.s, z7.b, z0.b[3]\n" - "sdot z13.s, z7.b, z1.b[3]\n" - "sdot z17.s, z7.b, z2.b[3]\n" - "sdot z21.s, z7.b, z3.b[3]\n" - "sdot z25.s, z7.b, z4.b[3]\n" - "sdot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "sdot z8.s, z7.b, z0.b[3]\n" + "sdot z12.s, z7.b, z1.b[3]\n" + "sdot z16.s, z7.b, z2.b[3]\n" + "sdot z20.s, z7.b, z3.b[3]\n" + "sdot z24.s, z7.b, z4.b[3]\n" + "sdot z28.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "sdot z9.s, z6.b, z0.b[3]\n" + "sdot z13.s, z6.b, z1.b[3]\n" + "sdot z17.s, z6.b, z2.b[3]\n" + "sdot z21.s, z6.b, z3.b[3]\n" + "sdot z25.s, z6.b, z4.b[3]\n" + "sdot z29.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "sdot z10.s, z6.b, z0.b[3]\n" - "sdot z14.s, z6.b, z1.b[3]\n" - "sdot z18.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[3]\n" - "sdot z26.s, z6.b, z4.b[3]\n" - "sdot z30.s, z6.b, z5.b[3]\n" - "sdot z11.s, z7.b, z0.b[3]\n" - "sdot z15.s, z7.b, z1.b[3]\n" - "sdot z19.s, z7.b, z2.b[3]\n" - "sdot z23.s, z7.b, z3.b[3]\n" - "sdot z27.s, z7.b, z4.b[3]\n" - "sdot z31.s, z7.b, z5.b[3]\n" + "sdot z10.s, z7.b, z0.b[3]\n" + "sdot z14.s, z7.b, z1.b[3]\n" + "sdot z18.s, z7.b, z2.b[3]\n" + "sdot z22.s, z7.b, z3.b[3]\n" + "sdot z26.s, z7.b, z4.b[3]\n" + "sdot z30.s, z7.b, z5.b[3]\n" + "sdot z11.s, z6.b, z0.b[3]\n" + "sdot z15.s, z6.b, z1.b[3]\n" + "sdot z19.s, z6.b, z2.b[3]\n" + "sdot z23.s, z6.b, z3.b[3]\n" + "sdot z27.s, z6.b, z4.b[3]\n" + "sdot z31.s, z6.b, z5.b[3]\n" "65:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1748,7 +1748,6 @@ void sve_hybrid_s8s32_dot_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "68:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", 
"p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1756,4 +1755,4 @@ void sve_hybrid_s8s32_dot_6x4VL ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp index c08977570e..686295496e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, int32_t>::value) { switch (ci->get_cpu_model()) { default: @@ -86,7 +85,6 @@ public: } } - if (std::is_same<T, int8_t>::value) { switch (ci->get_cpu_model()) { default: @@ -111,5 +109,4 @@ } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp index 350425647a..f66b6345ea 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp @@ -100,16 +100,16 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "incw x20\n" "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 3f\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 4f\n" @@ -127,11 +127,11 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -143,86 +143,86 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "ble 9f\n" "8:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL 
VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n" + ".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n" + ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n" + "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45109a8a // smmla z10.s, z20.b, z16.b\n" + ".inst 0x45079a8e // smmla z14.s, z20.b, z7.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n" + ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "bgt 8b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL 
VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" "addvl x10, x10, #8\n" "ble 10f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" + ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" + ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" + ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" + ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" "addvl x10, x10, #8\n" "10:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -258,21 +258,21 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 14f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x9, x20, LSL #2\n" + "ld1w { z18.s }, p4/Z, [x9]\n" + "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "zip1 z8.d, z18.d, z12.d\n" + "zip2 z12.d, z18.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z2.d, z13.d\n" + "zip2 z13.d, z2.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 15f\n" @@ -290,12 +290,12 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "16:" 
// Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -303,95 +303,95 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "b 18f\n" "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "18:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 20f\n" "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45119a88 // smmla z8.s, z20.b, z17.b\n" + 
".inst 0x45109a8c // smmla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45119a89 // smmla z9.s, z20.b, z17.b\n" + ".inst 0x45109a8d // smmla z13.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45119a8a // smmla z10.s, z20.b, z17.b\n" + ".inst 0x45109a8e // smmla z14.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + ".inst 0x45119a8b // smmla z11.s, z20.b, z17.b\n" + ".inst 0x45109a8f // smmla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "bgt 19b\n" "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45119a48 // smmla z8.s, z18.b, z17.b\n" + ".inst 0x45109a4c // smmla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45119a49 // smmla z9.s, z18.b, z17.b\n" + ".inst 0x45109a4d // smmla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45119a4a // smmla z10.s, z18.b, z17.b\n" + ".inst 0x45109a4e // smmla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45119a4b // smmla z11.s, z18.b, z17.b\n" + ".inst 0x45109a4f // smmla z15.s, z18.b, z16.b\n" "addvl x10, x10, #8\n" "ble 21f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + 
".inst 0x45119828 // smmla z8.s, z1.b, z17.b\n" + ".inst 0x4510982c // smmla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45119829 // smmla z9.s, z1.b, z17.b\n" + ".inst 0x4510982d // smmla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4511982a // smmla z10.s, z1.b, z17.b\n" + ".inst 0x4510982e // smmla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x4511982b // smmla z11.s, z1.b, z17.b\n" + ".inst 0x4510982f // smmla z15.s, z1.b, z16.b\n" "addvl x10, x10, #8\n" "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -399,24 +399,24 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "cmp x28, x20\n" "bne 16b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x20, x9, x20, LSL #2\n" + "uzp1 z16.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" + "uzp1 z17.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" - "uzp1 z13.d, z10.d, z14.d\n" + "st1w { z16.s }, p4, [x9]\n" + "uzp1 z16.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" - "uzp1 z14.d, z11.d, z15.d\n" + "st1w { z17.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z2.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "st1w { z16.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z2.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z8.s }, p4, [x24]\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z8.s }, p4, [x20]\n" + "st1w { z9.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x20, #3, MUL VL]\n" "22:" // Height 2: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -437,28 +437,28 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x20]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + 
"ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" @@ -490,13 +490,13 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 28f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 29f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -505,169 +505,169 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "b 29f\n" "28:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "29:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 31f\n" "30:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "trn1 z27.d, z30.d, z24.d\n" + "trn2 z30.d, z30.d, z24.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "trn1 z26.d, z28.d, z29.d\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z29.d\n" + ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, 
#0x10\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n" + ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n" + ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n" + ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n" + ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n" + ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n" + ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n" + ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n" + ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n" + ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n" "bgt 30b\n" "31:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p5/Z, 
[x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn1 z27.d, z1.d, z24.d\n" + "trn2 z1.d, z1.d, z24.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "trn1 z26.d, z3.d, z28.d\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199b68 // smmla z8.s, z27.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b6c // smmla z12.s, z27.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199b69 // smmla z9.s, z27.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45189b6d // smmla z13.s, z27.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z28.d\n" + ".inst 0x45199b6a // smmla z10.s, z27.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45189b6e // smmla z14.s, z27.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x45199b6b // smmla z11.s, z27.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + ".inst 0x45189b6f // smmla z15.s, z27.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" "ble 32f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199828 // smmla z8.s, z1.b, 
z25.b\n" + ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" + ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" + ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" + ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" + ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n" + ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" + ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" + ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" + ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" + ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" "32:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 27b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "uzp1 z25.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z24.d, z9.d, z13.d\n" + "st1w { z25.s }, p4, [x9]\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z25.d, z10.d, z14.d\n" + "st1w { z24.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z24.d, z11.d, z15.d\n" + "st1w { z25.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" "uzp1 z16.d, z16.d, z20.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "st1w { z24.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp1 z17.d, z17.d, z21.d\n" "uzp1 z18.d, z18.d, z22.d\n" - "st1w { z8.s }, p4, [x24]\n" + "st1w { z8.s }, p4, [x21]\n" "uzp1 z19.d, z19.d, z23.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z9.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x20, #3, MUL VL]\n" "33:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -688,37 +688,37 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x22, x9, x20, LSL 
#2\n" + "add x21, x22, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x21]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" @@ -746,14 +746,14 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 39f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -763,182 +763,182 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "b 40f\n" "39:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "40:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 42f\n" "41:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, 
z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z29.d, z30.d, z24.d\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z30.d, z30.d, z24.d\n" + "trn1 z26.d, z28.d, z27.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199ba8 // smmla z8.s, z29.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189bac // smmla z12.s, z29.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199ba9 // smmla z9.s, z29.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z27.d\n" + ".inst 0x45189bad // smmla z13.s, z29.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45199baa // smmla z10.s, z29.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, #0x10\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45189bae // smmla z14.s, z29.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45199bab // smmla z11.s, z29.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45189baf // smmla z15.s, z29.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45199bc8 // smmla z8.s, z30.b, z25.b\n" + ".inst 0x45199b90 // smmla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45189bcc // smmla z12.s, z30.b, z24.b\n" + ".inst 0x45189b94 // smmla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 
0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x45199bc9 // smmla z9.s, z30.b, z25.b\n" + ".inst 0x45199b91 // smmla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x45189bcd // smmla z13.s, z30.b, z24.b\n" + ".inst 0x45189b95 // smmla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45199bca // smmla z10.s, z30.b, z25.b\n" + ".inst 0x45199b92 // smmla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45189bce // smmla z14.s, z30.b, z24.b\n" + ".inst 0x45189b96 // smmla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45199bcb // smmla z11.s, z30.b, z25.b\n" + ".inst 0x45199b93 // smmla z19.s, z28.b, z25.b\n" + ".inst 0x45189bcf // smmla z15.s, z30.b, z24.b\n" + ".inst 0x45189b97 // smmla z23.s, z28.b, z24.b\n" "bgt 41b\n" "42:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z28.d, z1.d, z24.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z24.d\n" + "trn1 z26.d, z3.d, z27.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199b88 // smmla z8.s, z28.b, z25.b\n" + ".inst 0x45199b50 // smmla z16.s, z26.b, z25.b\n" + ".inst 0x45189b8c // smmla z12.s, z28.b, z24.b\n" + ".inst 0x45189b54 // smmla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199b89 // smmla z9.s, z28.b, z25.b\n" + ".inst 0x45199b51 // smmla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45189b8d // smmla z13.s, z28.b, z24.b\n" + ".inst 0x45189b55 // smmla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 
z3.d, z3.d, z27.d\n" + ".inst 0x45199b8a // smmla z10.s, z28.b, z25.b\n" + ".inst 0x45199b52 // smmla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45189b8e // smmla z14.s, z28.b, z24.b\n" + ".inst 0x45189b56 // smmla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" + ".inst 0x45199b8b // smmla z11.s, z28.b, z25.b\n" + ".inst 0x45199b53 // smmla z19.s, z26.b, z25.b\n" + ".inst 0x45189b8f // smmla z15.s, z28.b, z24.b\n" + ".inst 0x45189b57 // smmla z23.s, z26.b, z24.b\n" "ble 43f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45199828 // smmla z8.s, z1.b, z25.b\n" + ".inst 0x45199870 // smmla z16.s, z3.b, z25.b\n" + ".inst 0x4518982c // smmla z12.s, z1.b, z24.b\n" + ".inst 0x45189874 // smmla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45199829 // smmla z9.s, z1.b, z25.b\n" + ".inst 0x45199871 // smmla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x4518982d // smmla z13.s, z1.b, z24.b\n" + ".inst 0x45189875 // smmla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4519982a // smmla z10.s, z1.b, z25.b\n" + ".inst 0x45199872 // smmla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x4518982e // smmla z14.s, z1.b, z24.b\n" + ".inst 0x45189876 // smmla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" + ".inst 0x4519982b // smmla z11.s, z1.b, z25.b\n" + ".inst 0x45199873 // smmla z19.s, z3.b, z25.b\n" + ".inst 0x4518982f // smmla z15.s, z1.b, z24.b\n" + ".inst 0x45189877 // smmla z23.s, z3.b, z24.b\n" "43:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 38b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" - "add x22, x23, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "uzp1 z25.d, z8.d, z12.d\n" + "add x20, x21, x20, LSL 
#2\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z24.d, z9.d, z13.d\n" + "st1w { z25.s }, p4, [x9]\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z25.d, z10.d, z14.d\n" + "st1w { z24.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z24.d, z11.d, z15.d\n" + "st1w { z25.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" - "uzp1 z15.d, z16.d, z20.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "uzp1 z25.d, z16.d, z20.d\n" + "st1w { z24.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp2 z16.d, z16.d, z20.d\n" - "uzp1 z20.d, z17.d, z21.d\n" - "st1w { z8.s }, p4, [x24]\n" + "uzp1 z24.d, z17.d, z21.d\n" + "st1w { z8.s }, p4, [x22]\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z9.s }, p3, [x22, #1, MUL VL]\n" "uzp2 z18.d, z18.d, z22.d\n" - "uzp1 z22.d, z19.d, z23.d\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "uzp1 z20.d, z19.d, z23.d\n" + "st1w { z10.s }, p2, [x22, #2, MUL VL]\n" "uzp2 z19.d, z19.d, z23.d\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z15.s }, p4, [x23]\n" - "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x22]\n" - "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z11.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z25.s }, p4, [x21]\n" + "st1w { z24.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x20, #3, MUL VL]\n" "44:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -959,54 +959,54 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { 
z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x21]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x20]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z19.d, z24.d, z23.d\n" "zip2 z23.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z24.d, z25.d, z28.d\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 48f\n" "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -1038,15 +1038,15 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1057,231 +1057,231 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "b 51f\n" "50:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "51:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 53f\n" "52:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z6.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x24]\n" + 
"ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z5.d, z6.d, z1.d\n" + "trn2 z6.d, z6.d, z1.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "trn1 z3.d, z7.d, z2.d\n" + "trn2 z7.d, z7.d, z2.d\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "trn1 z2.d, z4.d, z0.d\n" + "trn2 z4.d, z4.d, z0.d\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x450198a8 // smmla z8.s, z5.b, z1.b\n" + ".inst 0x45019870 // smmla z16.s, z3.b, z1.b\n" + ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x450098ac // smmla z12.s, z5.b, z0.b\n" + ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x450198a9 // smmla z9.s, z5.b, z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45019871 // smmla z17.s, z3.b, z1.b\n" + ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x450098ad // smmla z13.s, z5.b, z0.b\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x450198aa // smmla z10.s, z5.b, z1.b\n" + ".inst 0x45019872 // smmla z18.s, z3.b, z1.b\n" + ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x450098ae // smmla z14.s, z5.b, z0.b\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla 
z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x450198ab // smmla z11.s, z5.b, z1.b\n" + ".inst 0x45019873 // smmla z19.s, z3.b, z1.b\n" + ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x450098af // smmla z15.s, z5.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" + ".inst 0x450198f0 // smmla z16.s, z7.b, z1.b\n" + ".inst 0x45019898 // smmla z24.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" + ".inst 0x450098f4 // smmla z20.s, z7.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" + ".inst 0x450198f1 // smmla z17.s, z7.b, z1.b\n" + ".inst 0x45019899 // smmla z25.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" + ".inst 0x450098f5 // smmla z21.s, z7.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" + ".inst 0x450198f2 // smmla z18.s, z7.b, z1.b\n" + ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" + ".inst 0x450098f6 // smmla z22.s, z7.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" + ".inst 0x450198f3 // smmla z19.s, z7.b, z1.b\n" + ".inst 0x4501989b // smmla z27.s, z4.b, z1.b\n" + ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" + ".inst 0x450098f7 // smmla z23.s, z7.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "bgt 52b\n" "53:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z7.d, z1.d, z4.d\n" + "trn2 z1.d, z1.d, z4.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, 
z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "trn1 z6.d, z3.d, z2.d\n" + "trn2 z3.d, z3.d, z2.d\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" + ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" + ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" + ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" + ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" + ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" + ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" "addvl x10, x10, #8\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" + ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "ble 54f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL 
VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" + ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" + ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" + ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" + ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" + ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" + ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" + ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" + ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" + ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" + ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" + ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" + ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" + ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" + ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" "54:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 49b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "uzp1 z2.d, z8.d, z12.d\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" + "uzp1 z1.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z0.d, z10.d, z14.d\n" + "st1w { z2.s }, p4, [x9]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z2.d, z11.d, z15.d\n" + "st1w { z1.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" - "uzp1 
z15.d, z16.d, z20.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z1.d, z16.d, z20.d\n" + "st1w { z0.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z16.d, z16.d, z20.d\n" - "uzp1 z20.d, z17.d, z21.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "uzp1 z0.d, z17.d, z21.d\n" + "st1w { z2.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" - "st1w { z8.s }, p4, [x24]\n" + "st1w { z8.s }, p4, [x23]\n" "uzp2 z18.d, z18.d, z22.d\n" - "uzp1 z22.d, z19.d, z23.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "uzp1 z20.d, z19.d, z23.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" "uzp2 z19.d, z19.d, z23.d\n" "uzp1 z24.d, z24.d, z28.d\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" "uzp1 z27.d, z27.d, z31.d\n" - "st1w { z15.s }, p4, [x23]\n" - "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x22]\n" - "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p4, [x21]\n" - "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z1.s }, p4, [x22]\n" + "st1w { z0.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" "55:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -1307,26 +1307,26 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n" "add x20, x21, x20, LSL #2\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" + "zip1 z8.d, z17.d, z12.d\n" "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip2 z12.d, z9.d, z12.d\n" - "zip1 z9.d, z10.d, z13.d\n" + "zip2 z12.d, z17.d, z12.d\n" + "zip1 z9.d, z18.d, z13.d\n" "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" "ld1w { z17.s }, p4/Z, [x23]\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z20.d, z14.d\n" "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip2 z14.d, z20.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" "ld1w { z20.s }, p4/Z, [x22]\n" @@ -1344,7 +1344,7 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x21, #3, 
MUL VL]\n" "ld1w { z28.s }, p4/Z, [x20]\n" "zip2 z23.d, z24.d, z23.d\n" "zip1 z24.d, z25.d, z28.d\n" @@ -1356,8 +1356,8 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 59f\n" "58:" // Height 6: no accumulate "mov z8.s, #0x0\n" @@ -1389,16 +1389,16 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1410,184 +1410,184 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "b 62f\n" "61:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "62:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 64f\n" "63:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z6.d, z7.d, z0.d\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn2 z7.d, z7.d, z0.d\n" + "trn1 z4.d, z5.d, z1.d\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z5.d, z5.d, z1.d\n" + "trn1 z2.d, z3.d, z0.d\n" + "trn2 z3.d, z3.d, z0.d\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x450198c8 // smmla z8.s, z6.b, z1.b\n" + ".inst 0x45019890 // smmla z16.s, z4.b, z1.b\n" + ".inst 0x45019858 // smmla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" + ".inst 0x450098cc // smmla z12.s, z6.b, z0.b\n" + ".inst 0x45009894 // smmla z20.s, z4.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, 
z7.b\n" + ".inst 0x4500985c // smmla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x450198c9 // smmla z9.s, z6.b, z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45019891 // smmla z17.s, z4.b, z1.b\n" + ".inst 0x45019859 // smmla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" + ".inst 0x450098cd // smmla z13.s, z6.b, z0.b\n" + ".inst 0x45009895 // smmla z21.s, z4.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" + ".inst 0x4500985d // smmla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x450198ca // smmla z10.s, z6.b, z1.b\n" "add x21, x21, #0x10\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45019892 // smmla z18.s, z4.b, z1.b\n" + ".inst 0x4501985a // smmla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x450098ce // smmla z14.s, z6.b, z0.b\n" + ".inst 0x45009896 // smmla z22.s, z4.b, z0.b\n" + ".inst 0x4500985e // smmla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla 
z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x450198cb // smmla z11.s, z6.b, z1.b\n" + ".inst 0x45019893 // smmla z19.s, z4.b, z1.b\n" + ".inst 0x4501985b // smmla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x450098cf // smmla z15.s, z6.b, z0.b\n" + ".inst 0x45009897 // smmla z23.s, z4.b, z0.b\n" + ".inst 0x4500985f // smmla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n" + ".inst 0x450198b0 // smmla z16.s, z5.b, z1.b\n" + ".inst 0x45019878 // smmla z24.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098b4 // smmla z20.s, z5.b, z0.b\n" + ".inst 0x4500987c // smmla z28.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x450198e9 // smmla z9.s, z7.b, z1.b\n" + ".inst 0x450198b1 // smmla z17.s, z5.b, z1.b\n" + ".inst 0x45019879 // smmla z25.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098b5 // smmla z21.s, z5.b, z0.b\n" + ".inst 0x4500987d // smmla z29.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n" + ".inst 0x450198b2 // smmla z18.s, z5.b, z1.b\n" + ".inst 0x4501987a // smmla z26.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098b6 // smmla z22.s, z5.b, z0.b\n" + ".inst 0x4500987e // smmla z30.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x450198eb // smmla z11.s, z7.b, z1.b\n" + ".inst 0x450198b3 // smmla z19.s, z5.b, z1.b\n" + ".inst 0x4501987b // smmla z27.s, z3.b, z1.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n" + ".inst 0x4500987f // smmla z31.s, z3.b, z0.b\n" "bgt 63b\n" "64:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z7.d, z1.d, z0.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z0.d\n" + "trn1 z6.d, z3.d, z2.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n" - ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n" - ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z3.d, z3.d, z2.d\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x450298e8 // smmla z8.s, z7.b, z2.b\n" + ".inst 0x450298d0 // smmla z16.s, z6.b, z2.b\n" + ".inst 0x45029898 // smmla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" - ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n" - ".inst 
0x45079899 // smmla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n" - ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n" - ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" + ".inst 0x450098ec // smmla z12.s, z7.b, z0.b\n" + ".inst 0x450098d4 // smmla z20.s, z6.b, z0.b\n" + ".inst 0x4500989c // smmla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x450298e9 // smmla z9.s, z7.b, z2.b\n" + ".inst 0x450298d1 // smmla z17.s, z6.b, z2.b\n" + ".inst 0x45029899 // smmla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450098d5 // smmla z21.s, z6.b, z0.b\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x450298ea // smmla z10.s, z7.b, z2.b\n" + ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" + ".inst 0x4502989a // smmla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x450098ee // smmla z14.s, z7.b, z0.b\n" + ".inst 0x450098d6 // smmla z22.s, z6.b, z0.b\n" + ".inst 0x4500989e // smmla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x450298eb // smmla z11.s, z7.b, z2.b\n" "addvl x10, x10, #8\n" - ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n" - ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n" - ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n" - ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n" - ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n" + ".inst 0x450298d3 // smmla z19.s, z6.b, z2.b\n" + ".inst 0x4502989b // smmla z27.s, z4.b, z2.b\n" + ".inst 0x450098ef // smmla z15.s, z7.b, z0.b\n" + ".inst 0x450098d7 // smmla z23.s, z6.b, z0.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "ble 65f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n" - ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n" - ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n" - ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n" - ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n" - ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n" - ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" - ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n" - ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n" - ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n" - ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n" - ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z2.b }, p5/Z, 
[x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45029828 // smmla z8.s, z1.b, z2.b\n" + ".inst 0x45029870 // smmla z16.s, z3.b, z2.b\n" + ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" + ".inst 0x4500982c // smmla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x45009874 // smmla z20.s, z3.b, z0.b\n" + ".inst 0x450098bc // smmla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45029829 // smmla z9.s, z1.b, z2.b\n" + ".inst 0x45029871 // smmla z17.s, z3.b, z2.b\n" + ".inst 0x450298b9 // smmla z25.s, z5.b, z2.b\n" + ".inst 0x4500982d // smmla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45009875 // smmla z21.s, z3.b, z0.b\n" + ".inst 0x450098bd // smmla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x4502982a // smmla z10.s, z1.b, z2.b\n" + ".inst 0x45029872 // smmla z18.s, z3.b, z2.b\n" + ".inst 0x450298ba // smmla z26.s, z5.b, z2.b\n" + ".inst 0x4500982e // smmla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45009876 // smmla z22.s, z3.b, z0.b\n" + ".inst 0x450098be // smmla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n" - ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n" - ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n" - ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n" + ".inst 0x4502982b // smmla z11.s, z1.b, z2.b\n" + ".inst 0x45029873 // smmla z19.s, z3.b, z2.b\n" + ".inst 0x450298bb // smmla z27.s, z5.b, z2.b\n" + ".inst 0x4500982f // smmla z15.s, z1.b, z0.b\n" + ".inst 0x45009877 // smmla z23.s, z3.b, z0.b\n" + ".inst 0x450098bf // smmla z31.s, z5.b, z0.b\n" "65:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1596,7 +1596,7 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z0.d, z8.d, z12.d\n" "add x22, x23, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" "uzp2 z8.d, z8.d, z12.d\n" @@ -1604,7 +1604,7 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "add x20, x21, x20, LSL #2\n" "uzp2 z9.d, z9.d, z13.d\n" "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z7.s }, p4, [x9]\n" + "st1w { z0.s }, p4, [x9]\n" "uzp2 z10.d, z10.d, z14.d\n" "uzp1 z14.d, z11.d, z15.d\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1664,7 +1664,6 @@ void sve_hybrid_s8s32_mmla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "68:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", 
"z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1672,4 +1671,4 @@ void sve_hybrid_s8s32_mmla_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp index c66ebedc4d..11fe5ce7e3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -97,5 +96,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp index 79bd563a4b..e74b424888 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp @@ -104,11 +104,11 @@ void sve_hybrid_u8qa_dot_4x4VL ( "4:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 5f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" "cbnz x26, 6f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -121,39 +121,39 @@ void sve_hybrid_u8qa_dot_4x4VL ( "7:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z19.s, z7.b, z0.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28]\n" + "udot z16.s, z20.b, z0.b[0]\n" + "ld1b { z21.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z17.s, z21.b, z0.b[0]\n" + "udot z18.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z19.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #4, MUL VL]\n" + "udot z16.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #6, MUL VL]\n" + "udot z17.s, z21.b, z0.b[1]\n" + "udot z18.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "udot z19.s, z4.b, z0.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" - "udot z16.s, z5.b, z0.b[2]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" + "udot z19.s, z20.b, z0.b[1]\n" + "ld1b { z22.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { 
z21.b }, p2/Z, [x28, #-6, MUL VL]\n" + "udot z16.s, z22.b, z0.b[2]\n" + "udot z17.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p2/Z, [x28, #-5, MUL VL]\n" + "udot z18.s, z21.b, z0.b[2]\n" + "udot z19.s, z20.b, z0.b[2]\n" + "ld1b { z22.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #-3, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x28, #-2, MUL VL]\n" + "udot z16.s, z22.b, z0.b[3]\n" + "udot z17.s, z20.b, z0.b[3]\n" + "ld1b { z20.b }, p2/Z, [x28, #-1, MUL VL]\n" + "udot z18.s, z21.b, z0.b[3]\n" + "udot z19.s, z20.b, z0.b[3]\n" "add x24, x24, #0x10\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" @@ -164,47 +164,47 @@ void sve_hybrid_u8qa_dot_4x4VL ( "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z0.b }, p0/Z, [x24]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28]\n" "subs x25, x25, #0x4\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z19.s, z7.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z16.s, z22.b, z0.b[0]\n" + "udot z17.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z18.s, z21.b, z0.b[0]\n" + "udot z19.s, z20.b, z0.b[0]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z19.s, z4.b, z0.b[1]\n" + "udot z16.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z22.b, z0.b[1]\n" + "udot z18.s, z21.b, z0.b[1]\n" + "udot z19.s, z20.b, z0.b[1]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28]\n" + "ld1b { z22.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" + "udot z16.s, z20.b, z0.b[2]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z22.b, z0.b[2]\n" + "udot z18.s, z21.b, z0.b[2]\n" + "udot z19.s, z20.b, z0.b[2]\n" "addvl x28, x28, #4\n" "ble 10f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" + "ld1b { z21.b }, p2/Z, [x28]\n" + "ld1b { z20.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z21.b, z0.b[3]\n" + "udot z17.s, z20.b, z0.b[3]\n" + "ld1b { z21.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z20.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z18.s, z21.b, z0.b[3]\n" + "udot z19.s, z20.b, z0.b[3]\n" "addvl x28, x28, #4\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" @@ -218,71 +218,71 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, 
XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z1.s }, p2/Z, [x20]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" - "neg z1.s, p2/M, z1.s\n" - "mul z11.s, p2/M, z11.s, z1.s\n" + "neg z20.s, p2/M, z20.s\n" + "mul z11.s, p2/M, z11.s, z20.s\n" "12:" // Height 1: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [x10]\n" + "ld1w { z22.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x10, #3, MUL VL]\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "add z16.s, z16.s, z23.s\n" + "add z17.s, z17.s, z22.s\n" + "add z18.s, z18.s, z21.s\n" + "add z19.s, z19.s, z20.s\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n" + ".inst 0x04b47631 // sqrdmulh z17.s, z17.s, z20.s\n" "addvl x10, x10, #4\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" + ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n" "tbz %x[flags], #5, 13f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z23.d, z16.d, z0.d\n" + "and z22.d, z17.d, z0.d\n" + "and z21.d, z18.d, z0.d\n" + "and z20.d, z19.d, z0.d\n" + "asr z23.s, z23.s, #0x1f\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "sqadd z16.s, z16.s, z23.s\n" + "sqadd z17.s, z17.s, z22.s\n" + "sqadd z18.s, z18.s, z21.s\n" + "sqadd z19.s, z19.s, z20.s\n" "13:" // Height 1: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z20.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z20.s\n" + "add z18.s, z18.s, z20.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z19.s, z19.s, z4.s\n" + "ld1rw { z21.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z20.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z21.s\n" + "smin z17.s, p2/M, z17.s, 
z21.s\n" + "smin z18.s, p2/M, z18.s, z21.s\n" + "smin z19.s, p2/M, z19.s, z21.s\n" + "smax z16.s, p2/M, z16.s, z20.s\n" + "smax z17.s, p2/M, z17.s, z20.s\n" + "smax z18.s, p2/M, z18.s, z20.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z20.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" "st1b { z16.b }, p1, [x27]\n" @@ -317,12 +317,12 @@ void sve_hybrid_u8qa_dot_4x4VL ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -330,7 +330,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "b 20f\n" "19:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "20:" // Height 2: input setup done "cmp x25, #0x10\n" "ble 23f\n" @@ -339,56 +339,56 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "udot z21.s, z5.b, z1.b[0]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "udot z19.s, z7.b, z0.b[0]\n" - "udot z23.s, z7.b, z1.b[0]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z24.b, z0.b[0]\n" + "udot z20.s, z24.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z26.b, z0.b[0]\n" + "udot z21.s, z26.b, z1.b[0]\n" + "udot z18.s, z24.b, z0.b[0]\n" + "udot z22.s, z24.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28, #5, MUL VL]\n" + "udot z19.s, z25.b, z0.b[0]\n" + "udot z23.s, z25.b, z1.b[0]\n" + "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "udot z16.s, z8.b, z0.b[1]\n" - "udot z20.s, z8.b, z1.b[1]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z21.s, z9.b, z1.b[1]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z22.s, z10.b, z1.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z19.s, z4.b, z0.b[1]\n" - "udot z23.s, z4.b, z1.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z5.b, z0.b[2]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" + "udot z16.s, z24.b, z0.b[1]\n" + "udot z20.s, z24.b, z1.b[1]\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + "udot z17.s, z27.b, z0.b[1]\n" + "udot z21.s, 
z27.b, z1.b[1]\n" + "ld1b { z30.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #-6, MUL VL]\n" + "udot z18.s, z26.b, z0.b[1]\n" + "udot z22.s, z26.b, z1.b[1]\n" + "ld1b { z28.b }, p2/Z, [x28, #-5, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "udot z19.s, z25.b, z0.b[1]\n" + "udot z23.s, z25.b, z1.b[1]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "udot z16.s, z24.b, z0.b[2]\n" + "udot z20.s, z24.b, z1.b[2]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" "add x23, x23, #0x10\n" - "udot z17.s, z6.b, z0.b[2]\n" - "udot z21.s, z6.b, z1.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "udot z23.s, z8.b, z1.b[2]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" + "udot z17.s, z30.b, z0.b[2]\n" + "udot z21.s, z30.b, z1.b[2]\n" + "udot z18.s, z29.b, z0.b[2]\n" + "udot z22.s, z29.b, z1.b[2]\n" + "udot z19.s, z28.b, z0.b[2]\n" + "udot z23.s, z28.b, z1.b[2]\n" + "udot z16.s, z27.b, z0.b[3]\n" + "udot z20.s, z27.b, z1.b[3]\n" + "udot z17.s, z26.b, z0.b[3]\n" + "udot z21.s, z26.b, z1.b[3]\n" + "udot z18.s, z25.b, z0.b[3]\n" + "udot z22.s, z25.b, z1.b[3]\n" + "udot z19.s, z24.b, z0.b[3]\n" + "udot z23.s, z24.b, z1.b[3]\n" "tbnz %x[flags], #31, 22f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" @@ -401,63 +401,63 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z0.b }, p0/Z, [x24]\n" "ld1rqb { z1.b }, p0/Z, [x23]\n" "subs x25, x25, #0x4\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "udot z21.s, z5.b, z1.b[0]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z24.b, z0.b[0]\n" + "udot z20.s, z24.b, z1.b[0]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z26.b, z0.b[0]\n" + "udot z21.s, z26.b, z1.b[0]\n" + "udot z18.s, z25.b, z0.b[0]\n" + "udot z22.s, z25.b, z1.b[0]\n" "addvl x28, x28, #4\n" - "udot z19.s, z7.b, z0.b[0]\n" - "udot z23.s, z7.b, z1.b[0]\n" + "udot z19.s, z24.b, z0.b[0]\n" + "udot z23.s, z24.b, z1.b[0]\n" "ble 24f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[1]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z21.s, z9.b, z1.b[1]\n" - "udot z18.s, z10.b, z0.b[1]\n" + "udot z16.s, z27.b, z0.b[1]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z27.b, z1.b[1]\n" + "udot z17.s, z26.b, z0.b[1]\n" + "udot z21.s, z26.b, z1.b[1]\n" + "udot z18.s, z25.b, z0.b[1]\n" "addvl x28, x28, #4\n" - "udot z22.s, z10.b, z1.b[1]\n" - "udot z19.s, z4.b, z0.b[1]\n" - "udot z23.s, z4.b, z1.b[1]\n" + "udot z22.s, z25.b, z1.b[1]\n" + "udot z19.s, z24.b, z0.b[1]\n" + "udot z23.s, z24.b, z1.b[1]\n" "ble 24f\n" - "ld1b { z5.b }, p2/Z, 
[x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "udot z21.s, z6.b, z1.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" + "udot z16.s, z27.b, z0.b[2]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z27.b, z1.b[2]\n" + "udot z17.s, z26.b, z0.b[2]\n" + "udot z21.s, z26.b, z1.b[2]\n" + "udot z18.s, z25.b, z0.b[2]\n" "addvl x28, x28, #4\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "udot z23.s, z8.b, z1.b[2]\n" + "udot z22.s, z25.b, z1.b[2]\n" + "udot z19.s, z24.b, z0.b[2]\n" + "udot z23.s, z24.b, z1.b[2]\n" "ble 24f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z24.b, z0.b[3]\n" + "udot z20.s, z24.b, z1.b[3]\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z17.s, z26.b, z0.b[3]\n" + "udot z21.s, z26.b, z1.b[3]\n" + "udot z18.s, z25.b, z0.b[3]\n" + "udot z22.s, z25.b, z1.b[3]\n" "addvl x28, x28, #4\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" + "udot z19.s, z24.b, z0.b[3]\n" + "udot z23.s, z24.b, z1.b[3]\n" "24:" // Height 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 25f\n" "udot z11.s, z0.b, z15.b\n" @@ -473,120 +473,120 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z2.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "uaddv d12, p0, z12.s\n" - "neg z2.s, p2/M, z2.s\n" + "neg z24.s, p2/M, z24.s\n" "mov z12.s, z12.s[0]\n" - "mul z11.s, p2/M, z11.s, z2.s\n" - "mul z12.s, p2/M, z12.s, z2.s\n" + "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z12.s, p2/M, z12.s, z24.s\n" "26:" // Height 2: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10]\n" + "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" + "add z16.s, z16.s, z28.s\n" + "add z17.s, z17.s, z27.s\n" "addvl x10, x10, #4\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" + "add z18.s, z18.s, z26.s\n" + "add z19.s, z19.s, z25.s\n" + "add z20.s, z20.s, z28.s\n" + "add z21.s, z21.s, 
z27.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "add z22.s, z22.s, z26.s\n" + "add z23.s, z23.s, z25.s\n" + ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" + ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" + ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" + ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" + ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" + ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" + ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" + ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" "tbz %x[flags], #5, 27f\n" - "and z4.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" + "and z24.d, z16.d, z0.d\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z16.s, z16.s, z24.s\n" + "and z30.d, z17.d, z0.d\n" + "and z29.d, z18.d, z0.d\n" + "and z28.d, z19.d, z0.d\n" + "and z27.d, z20.d, z0.d\n" + "and z26.d, z21.d, z0.d\n" + "and z25.d, z22.d, z0.d\n" + "and z24.d, z23.d, z0.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z17.s, z17.s, z30.s\n" + "sqadd z18.s, z18.s, z29.s\n" + "sqadd z19.s, z19.s, z28.s\n" + "sqadd z20.s, z20.s, z27.s\n" + "sqadd z21.s, z21.s, z26.s\n" + "sqadd z22.s, z22.s, z25.s\n" + "sqadd z23.s, z23.s, z24.s\n" "27:" // Height 2: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z24.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z24.s\n" + "add z18.s, z18.s, z24.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z24.s\n" + "add z20.s, z20.s, z24.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z24.s\n" + "add z22.s, z22.s, z24.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add 
z23.s, z23.s, z4.s\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "add z23.s, z23.s, z24.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z25.s\n" + "smin z17.s, p2/M, z17.s, z25.s\n" + "smin z18.s, p2/M, z18.s, z25.s\n" + "smin z19.s, p2/M, z19.s, z25.s\n" + "smin z20.s, p2/M, z20.s, z25.s\n" + "smin z21.s, p2/M, z21.s, z25.s\n" + "smin z22.s, p2/M, z22.s, z25.s\n" + "smin z23.s, p2/M, z23.s, z25.s\n" + "smax z16.s, p2/M, z16.s, z24.s\n" + "smax z17.s, p2/M, z17.s, z24.s\n" + "smax z18.s, p2/M, z18.s, z24.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z24.s\n" + "smax z20.s, p2/M, z20.s, z24.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z24.s\n" + "smax z22.s, p2/M, z22.s, z24.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" + "smax z23.s, p2/M, z23.s, z24.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" "st1b { z20.b }, p1, [x23]\n" "addvl x27, x27, #1\n" "28:" // Height 2: Writeback done @@ -624,13 +624,13 @@ void sve_hybrid_u8qa_dot_4x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -639,8 +639,8 @@ void sve_hybrid_u8qa_dot_4x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "34:" // Height 3: input setup done "cmp x25, #0x10\n" "ble 37f\n" @@ -650,73 +650,73 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z1.b }, p0/Z, [x23]\n" "add x24, x24, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "udot z24.s, z4.b, z2.b[0]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z21.s, z5.b, z1.b[0]\n" - "udot z25.s, z5.b, z2.b[0]\n" - "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "udot z26.s, z6.b, z2.b[0]\n" - "udot z19.s, z7.b, z0.b[0]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + 
"ld1b { z28.b }, p2/Z, [x28]\n" + "udot z16.s, z28.b, z0.b[0]\n" + "udot z20.s, z28.b, z1.b[0]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z28.b, z2.b[0]\n" + "udot z17.s, z30.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z21.s, z30.b, z1.b[0]\n" + "udot z25.s, z30.b, z2.b[0]\n" + "ld1b { z3.b }, p2/Z, [x28, #4, MUL VL]\n" + "udot z18.s, z29.b, z0.b[0]\n" + "udot z22.s, z29.b, z1.b[0]\n" + "ld1b { z31.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z30.b }, p2/Z, [x28, #6, MUL VL]\n" + "udot z26.s, z29.b, z2.b[0]\n" + "udot z19.s, z28.b, z0.b[0]\n" + "ld1b { z29.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "udot z23.s, z7.b, z1.b[0]\n" - "udot z27.s, z7.b, z2.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[1]\n" - "udot z20.s, z8.b, z1.b[1]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + "udot z23.s, z28.b, z1.b[0]\n" + "udot z27.s, z28.b, z2.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #-8, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #-7, MUL VL]\n" + "udot z16.s, z3.b, z0.b[1]\n" + "udot z20.s, z3.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x28, #-6, MUL VL]\n" "add x23, x23, #0x10\n" - "udot z24.s, z8.b, z2.b[1]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + "udot z24.s, z3.b, z2.b[1]\n" + "udot z17.s, z31.b, z0.b[1]\n" + "ld1b { z3.b }, p2/Z, [x28, #-5, MUL VL]\n" "add x22, x22, #0x10\n" - "udot z21.s, z9.b, z1.b[1]\n" - "udot z25.s, z9.b, z2.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z22.s, z10.b, z1.b[1]\n" - "udot z26.s, z10.b, z2.b[1]\n" - "udot z19.s, z4.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z23.s, z4.b, z1.b[1]\n" - "udot z27.s, z4.b, z2.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z5.b, z0.b[2]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "udot z24.s, z5.b, z2.b[2]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z21.s, z6.b, z1.b[2]\n" - "udot z25.s, z6.b, z2.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z26.s, z7.b, z2.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "udot z23.s, z8.b, z1.b[2]\n" - "udot z27.s, z8.b, z2.b[2]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "udot z24.s, z9.b, z2.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" - "udot z25.s, z10.b, z2.b[3]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" - "udot z26.s, z4.b, z2.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" - "udot z27.s, z5.b, z2.b[3]\n" + "udot z21.s, z31.b, z1.b[1]\n" + "udot z25.s, z31.b, z2.b[1]\n" + "ld1b { z31.b }, p2/Z, [x28, #-4, MUL VL]\n" + "udot z18.s, z30.b, z0.b[1]\n" + "udot z22.s, z30.b, z1.b[1]\n" + "udot z26.s, z30.b, z2.b[1]\n" + "udot z19.s, z29.b, z0.b[1]\n" + "ld1b { z30.b }, p2/Z, [x28, #-3, MUL VL]\n" + "udot z23.s, z29.b, z1.b[1]\n" + "udot z27.s, z29.b, z2.b[1]\n" + "ld1b { z29.b }, p2/Z, [x28, #-2, MUL VL]\n" + "udot z16.s, z28.b, z0.b[2]\n" + "udot z20.s, z28.b, z1.b[2]\n" + "udot z24.s, z28.b, z2.b[2]\n" + "udot z17.s, z5.b, z0.b[2]\n" + "ld1b { z28.b }, p2/Z, [x28, #-1, MUL VL]\n" + "udot z21.s, z5.b, z1.b[2]\n" + "udot z25.s, z5.b, z2.b[2]\n" + "udot z18.s, z4.b, z0.b[2]\n" + "udot z22.s, z4.b, z1.b[2]\n" + "udot z26.s, z4.b, z2.b[2]\n" + "udot z19.s, z3.b, z0.b[2]\n" + "udot z23.s, z3.b, z1.b[2]\n" + "udot z27.s, z3.b, 
z2.b[2]\n" + "udot z16.s, z31.b, z0.b[3]\n" + "udot z20.s, z31.b, z1.b[3]\n" + "udot z24.s, z31.b, z2.b[3]\n" + "udot z17.s, z30.b, z0.b[3]\n" + "udot z21.s, z30.b, z1.b[3]\n" + "udot z25.s, z30.b, z2.b[3]\n" + "udot z18.s, z29.b, z0.b[3]\n" + "udot z22.s, z29.b, z1.b[3]\n" + "udot z26.s, z29.b, z2.b[3]\n" + "udot z19.s, z28.b, z0.b[3]\n" + "udot z23.s, z28.b, z1.b[3]\n" + "udot z27.s, z28.b, z2.b[3]\n" "tbnz %x[flags], #31, 36f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" @@ -731,79 +731,79 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z1.b }, p0/Z, [x23]\n" "subs x25, x25, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "udot z24.s, z4.b, z2.b[0]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z21.s, z5.b, z1.b[0]\n" - "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28]\n" + "udot z16.s, z28.b, z0.b[0]\n" + "udot z20.s, z28.b, z1.b[0]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z24.s, z28.b, z2.b[0]\n" + "udot z17.s, z30.b, z0.b[0]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z21.s, z30.b, z1.b[0]\n" + "udot z25.s, z30.b, z2.b[0]\n" "addvl x28, x28, #4\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" - "udot z26.s, z6.b, z2.b[0]\n" - "udot z19.s, z7.b, z0.b[0]\n" - "udot z23.s, z7.b, z1.b[0]\n" - "udot z27.s, z7.b, z2.b[0]\n" + "udot z18.s, z29.b, z0.b[0]\n" + "udot z22.s, z29.b, z1.b[0]\n" + "udot z26.s, z29.b, z2.b[0]\n" + "udot z19.s, z28.b, z0.b[0]\n" + "udot z23.s, z28.b, z1.b[0]\n" + "udot z27.s, z28.b, z2.b[0]\n" "ble 38f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[1]\n" - "udot z24.s, z8.b, z2.b[1]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z21.s, z9.b, z1.b[1]\n" + "udot z16.s, z31.b, z0.b[1]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z31.b, z1.b[1]\n" + "udot z24.s, z31.b, z2.b[1]\n" + "udot z17.s, z30.b, z0.b[1]\n" + "udot z21.s, z30.b, z1.b[1]\n" "addvl x28, x28, #4\n" - "udot z25.s, z9.b, z2.b[1]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z22.s, z10.b, z1.b[1]\n" - "udot z26.s, z10.b, z2.b[1]\n" - "udot z19.s, z4.b, z0.b[1]\n" - "udot z23.s, z4.b, z1.b[1]\n" - "udot z27.s, z4.b, z2.b[1]\n" + "udot z25.s, z30.b, z2.b[1]\n" + "udot z18.s, z29.b, z0.b[1]\n" + "udot z22.s, z29.b, z1.b[1]\n" + "udot z26.s, z29.b, z2.b[1]\n" + "udot z19.s, z28.b, z0.b[1]\n" + "udot z23.s, z28.b, z1.b[1]\n" + "udot z27.s, z28.b, z2.b[1]\n" "ble 38f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "udot z24.s, z5.b, z2.b[2]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "udot z21.s, z6.b, z1.b[2]\n" + "udot z16.s, z31.b, z0.b[2]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z31.b, 
z1.b[2]\n" + "udot z24.s, z31.b, z2.b[2]\n" + "udot z17.s, z30.b, z0.b[2]\n" + "udot z21.s, z30.b, z1.b[2]\n" "addvl x28, x28, #4\n" - "udot z25.s, z6.b, z2.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z26.s, z7.b, z2.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "udot z23.s, z8.b, z1.b[2]\n" - "udot z27.s, z8.b, z2.b[2]\n" + "udot z25.s, z30.b, z2.b[2]\n" + "udot z18.s, z29.b, z0.b[2]\n" + "udot z22.s, z29.b, z1.b[2]\n" + "udot z26.s, z29.b, z2.b[2]\n" + "udot z19.s, z28.b, z0.b[2]\n" + "udot z23.s, z28.b, z1.b[2]\n" + "udot z27.s, z28.b, z2.b[2]\n" "ble 38f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z24.s, z9.b, z2.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" - "udot z25.s, z10.b, z2.b[3]\n" + "ld1b { z31.b }, p2/Z, [x28]\n" + "ld1b { z30.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z31.b, z0.b[3]\n" + "udot z20.s, z31.b, z1.b[3]\n" + "ld1b { z29.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z28.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z24.s, z31.b, z2.b[3]\n" + "udot z17.s, z30.b, z0.b[3]\n" + "udot z21.s, z30.b, z1.b[3]\n" + "udot z25.s, z30.b, z2.b[3]\n" "addvl x28, x28, #4\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" - "udot z26.s, z4.b, z2.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" - "udot z27.s, z5.b, z2.b[3]\n" + "udot z18.s, z29.b, z0.b[3]\n" + "udot z22.s, z29.b, z1.b[3]\n" + "udot z26.s, z29.b, z2.b[3]\n" + "udot z19.s, z28.b, z0.b[3]\n" + "udot z23.s, z28.b, z1.b[3]\n" + "udot z27.s, z28.b, z2.b[3]\n" "38:" // Height 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 39f\n" "udot z11.s, z0.b, z15.b\n" @@ -821,33 +821,33 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z3.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "uaddv d12, p0, z12.s\n" "uaddv d13, p0, z13.s\n" "mov z12.s, z12.s[0]\n" "mov z13.s, z13.s[0]\n" - "neg z3.s, p2/M, z3.s\n" - "mul z11.s, p2/M, z11.s, z3.s\n" - "mul z12.s, p2/M, z12.s, z3.s\n" - "mul z13.s, p2/M, z13.s, z3.s\n" + "neg z28.s, p2/M, z28.s\n" + "mul z11.s, p2/M, z11.s, z28.s\n" + "mul z12.s, p2/M, z12.s, z28.s\n" + "mul z13.s, p2/M, z13.s, z28.s\n" "40:" // Height 3: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z31.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" @@ -855,133 +855,133 @@ void sve_hybrid_u8qa_dot_4x4VL ( "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "add z17.s, z17.s, z31.s\n" + "add 
z18.s, z18.s, z30.s\n" + "add z19.s, z19.s, z29.s\n" "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" + "add z21.s, z21.s, z31.s\n" + "add z22.s, z22.s, z30.s\n" + "add z23.s, z23.s, z29.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" + "add z25.s, z25.s, z31.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z26.s, z26.s, z30.s\n" + "add z27.s, z27.s, z29.s\n" + ".inst 0x04bc7610 // sqrdmulh z16.s, z16.s, z28.s\n" + ".inst 0x04bc7631 // sqrdmulh z17.s, z17.s, z28.s\n" + ".inst 0x04bc7652 // sqrdmulh z18.s, z18.s, z28.s\n" + ".inst 0x04bc7673 // sqrdmulh z19.s, z19.s, z28.s\n" + ".inst 0x04bc7694 // sqrdmulh z20.s, z20.s, z28.s\n" + ".inst 0x04bc76b5 // sqrdmulh z21.s, z21.s, z28.s\n" + ".inst 0x04bc76d6 // sqrdmulh z22.s, z22.s, z28.s\n" + ".inst 0x04bc76f7 // sqrdmulh z23.s, z23.s, z28.s\n" + ".inst 0x04bc7718 // sqrdmulh z24.s, z24.s, z28.s\n" + ".inst 0x04bc7739 // sqrdmulh z25.s, z25.s, z28.s\n" + ".inst 0x04bc775a // sqrdmulh z26.s, z26.s, z28.s\n" + ".inst 0x04bc777b // sqrdmulh z27.s, z27.s, z28.s\n" "tbz %x[flags], #5, 41f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z24.d, z0.d\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" + "and z1.d, z16.d, z0.d\n" + "and z31.d, z17.d, z0.d\n" + "and z30.d, z18.d, z0.d\n" + "and z29.d, z19.d, z0.d\n" + "and z28.d, z20.d, z0.d\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z16.s, z16.s, z1.s\n" + "sqadd z17.s, z17.s, z31.s\n" + "sqadd z18.s, z18.s, z30.s\n" + "sqadd z19.s, z19.s, z29.s\n" + "sqadd z20.s, z20.s, z28.s\n" + "and z3.d, z21.d, z0.d\n" + "and z2.d, z22.d, z0.d\n" + "and z1.d, z23.d, z0.d\n" + "and z31.d, z24.d, z0.d\n" + "and z30.d, z25.d, z0.d\n" + "and z29.d, z26.d, z0.d\n" + "and z28.d, z27.d, z0.d\n" + "asr z3.s, z3.s, #0x1f\n" 
+ "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z31.s, z31.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "sqadd z21.s, z21.s, z3.s\n" + "sqadd z22.s, z22.s, z2.s\n" + "sqadd z23.s, z23.s, z1.s\n" + "sqadd z24.s, z24.s, z31.s\n" + "sqadd z25.s, z25.s, z30.s\n" + "sqadd z26.s, z26.s, z29.s\n" + "sqadd z27.s, z27.s, z28.s\n" "41:" // Height 3: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z28.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z28.s\n" + "add z18.s, z18.s, z28.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z28.s\n" + "add z20.s, z20.s, z28.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z28.s\n" + "add z22.s, z22.s, z28.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z23.s, z23.s, z28.s\n" + "add z24.s, z24.s, z28.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z28.s\n" + "add z26.s, z26.s, z28.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z27.s, z27.s, z4.s\n" + "ld1rw { z29.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z28.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z29.s\n" + "smin z17.s, p2/M, z17.s, z29.s\n" + "smin z18.s, p2/M, z18.s, z29.s\n" + "smin z19.s, p2/M, z19.s, z29.s\n" + "smin z20.s, p2/M, z20.s, z29.s\n" + "smin z21.s, p2/M, z21.s, z29.s\n" + "smin z22.s, p2/M, z22.s, z29.s\n" + "smin z23.s, p2/M, z23.s, z29.s\n" + "smin z24.s, p2/M, z24.s, z29.s\n" + "smin z25.s, p2/M, z25.s, z29.s\n" + "smin z26.s, p2/M, z26.s, z29.s\n" + "smin z27.s, p2/M, z27.s, z29.s\n" + "smax z16.s, p2/M, z16.s, z28.s\n" + "smax z17.s, p2/M, z17.s, z28.s\n" + "smax z18.s, p2/M, z18.s, z28.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z28.s\n" + "smax z20.s, p2/M, z20.s, z28.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z28.s\n" + "smax 
z22.s, p2/M, z22.s, z28.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z28.s\n" + "smax z24.s, p2/M, z24.s, z28.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z28.s\n" + "smax z26.s, p2/M, z26.s, z28.s\n" "uzp1 z24.h, z24.h, z25.h\n" "st1b { z20.b }, p1, [x23]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" + "smax z27.s, p2/M, z27.s, z28.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" "st1b { z24.b }, p1, [x22]\n" "addvl x27, x27, #1\n" "42:" // Height 3: Writeback done @@ -1027,14 +1027,14 @@ void sve_hybrid_u8qa_dot_4x4VL ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1044,9 +1044,9 @@ void sve_hybrid_u8qa_dot_4x4VL ( "b 48f\n" "47:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "48:" // Height 4: input setup done "cmp x25, #0x10\n" "ble 51f\n" @@ -1059,88 +1059,88 @@ void sve_hybrid_u8qa_dot_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x21]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z24.s, z4.b, z2.b[0]\n" - "udot z28.s, z4.b, z3.b[0]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x28]\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z5.b, z0.b[0]\n" + "udot z20.s, z5.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z24.s, z5.b, z2.b[0]\n" + "udot z28.s, z5.b, z3.b[0]\n" + "udot z17.s, z4.b, z0.b[0]\n" + "udot z21.s, z4.b, z1.b[0]\n" "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n" - "udot z25.s, z5.b, z2.b[0]\n" - "udot z29.s, z5.b, z3.b[0]\n" - "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "udot z25.s, z4.b, z2.b[0]\n" + "udot z29.s, z4.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" "addvl x28, x28, #16\n" - "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n" - "udot z26.s, z6.b, z2.b[0]\n" - "udot z30.s, z6.b, z3.b[0]\n" - "ld1b { 
z6.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" "add x21, x21, #0x10\n" - "udot z19.s, z7.b, z0.b[0]\n" - "udot z23.s, z7.b, z1.b[0]\n" - "udot z27.s, z7.b, z2.b[0]\n" - "udot z31.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "udot z27.s, z9.b, z2.b[0]\n" + "udot z31.s, z9.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" "udot z16.s, z8.b, z0.b[1]\n" "udot z20.s, z8.b, z1.b[1]\n" "udot z24.s, z8.b, z2.b[1]\n" "udot z28.s, z8.b, z3.b[1]\n" "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" - "udot z17.s, z9.b, z0.b[1]\n" - "udot z21.s, z9.b, z1.b[1]\n" - "udot z25.s, z9.b, z2.b[1]\n" - "udot z29.s, z9.b, z3.b[1]\n" - "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z22.s, z10.b, z1.b[1]\n" - "udot z26.s, z10.b, z2.b[1]\n" - "udot z30.s, z10.b, z3.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n" - "udot z19.s, z4.b, z0.b[1]\n" - "udot z23.s, z4.b, z1.b[1]\n" - "udot z27.s, z4.b, z2.b[1]\n" - "udot z31.s, z4.b, z3.b[1]\n" - "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n" - "udot z16.s, z5.b, z0.b[2]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "udot z24.s, z5.b, z2.b[2]\n" - "udot z28.s, z5.b, z3.b[2]\n" - "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n" - "udot z17.s, z6.b, z0.b[2]\n" - "udot z21.s, z6.b, z1.b[2]\n" - "udot z25.s, z6.b, z2.b[2]\n" - "udot z29.s, z6.b, z3.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z26.s, z7.b, z2.b[2]\n" - "udot z30.s, z7.b, z3.b[2]\n" + "udot z17.s, z7.b, z0.b[1]\n" + "udot z21.s, z7.b, z1.b[1]\n" + "udot z25.s, z7.b, z2.b[1]\n" + "udot z29.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + "udot z18.s, z6.b, z0.b[1]\n" + "udot z22.s, z6.b, z1.b[1]\n" + "udot z26.s, z6.b, z2.b[1]\n" + "udot z30.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" + "udot z19.s, z5.b, z0.b[1]\n" + "udot z23.s, z5.b, z1.b[1]\n" + "udot z27.s, z5.b, z2.b[1]\n" + "udot z31.s, z5.b, z3.b[1]\n" + "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" + "udot z16.s, z4.b, z0.b[2]\n" + "udot z20.s, z4.b, z1.b[2]\n" + "udot z24.s, z4.b, z2.b[2]\n" + "udot z28.s, z4.b, z3.b[2]\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + "udot z17.s, z10.b, z0.b[2]\n" + "udot z21.s, z10.b, z1.b[2]\n" + "udot z25.s, z10.b, z2.b[2]\n" + "udot z29.s, z10.b, z3.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z26.s, z9.b, z2.b[2]\n" + "udot z30.s, z9.b, z3.b[2]\n" "udot z19.s, z8.b, z0.b[2]\n" "udot z23.s, z8.b, z1.b[2]\n" "udot z27.s, z8.b, z2.b[2]\n" "udot z31.s, z8.b, z3.b[2]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "udot z24.s, z9.b, z2.b[3]\n" - "udot z28.s, z9.b, z3.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" - "udot z25.s, z10.b, z2.b[3]\n" - "udot z29.s, z10.b, z3.b[3]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" - "udot z26.s, z4.b, z2.b[3]\n" - "udot z30.s, z4.b, z3.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" - "udot z27.s, z5.b, z2.b[3]\n" - "udot z31.s, z5.b, z3.b[3]\n" + "udot z16.s, z7.b, z0.b[3]\n" + "udot z20.s, z7.b, z1.b[3]\n" + "udot z24.s, z7.b, z2.b[3]\n" + "udot z28.s, z7.b, z3.b[3]\n" + "udot z17.s, z6.b, z0.b[3]\n" + "udot z21.s, z6.b, z1.b[3]\n" + "udot z25.s, z6.b, z2.b[3]\n" + "udot z29.s, z6.b, z3.b[3]\n" + "udot z18.s, 
z5.b, z0.b[3]\n" + "udot z22.s, z5.b, z1.b[3]\n" + "udot z26.s, z5.b, z2.b[3]\n" + "udot z30.s, z5.b, z3.b[3]\n" + "udot z19.s, z4.b, z0.b[3]\n" + "udot z23.s, z4.b, z1.b[3]\n" + "udot z27.s, z4.b, z2.b[3]\n" + "udot z31.s, z4.b, z3.b[3]\n" "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z12.s, z1.b, z15.b\n" @@ -1157,95 +1157,95 @@ void sve_hybrid_u8qa_dot_4x4VL ( "subs x25, x25, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x22]\n" "ld1rqb { z3.b }, p0/Z, [x21]\n" - "ld1b { z4.b }, p2/Z, [x28]\n" - "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z4.b, z0.b[0]\n" - "udot z20.s, z4.b, z1.b[0]\n" - "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z24.s, z4.b, z2.b[0]\n" - "udot z28.s, z4.b, z3.b[0]\n" - "udot z17.s, z5.b, z0.b[0]\n" - "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z7.b, z0.b[0]\n" + "udot z20.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z24.s, z7.b, z2.b[0]\n" + "udot z28.s, z7.b, z3.b[0]\n" + "udot z17.s, z6.b, z0.b[0]\n" + "udot z21.s, z6.b, z1.b[0]\n" "addvl x28, x28, #4\n" - "udot z25.s, z5.b, z2.b[0]\n" - "udot z29.s, z5.b, z3.b[0]\n" - "udot z18.s, z6.b, z0.b[0]\n" - "udot z22.s, z6.b, z1.b[0]\n" - "udot z26.s, z6.b, z2.b[0]\n" - "udot z30.s, z6.b, z3.b[0]\n" - "udot z19.s, z7.b, z0.b[0]\n" - "udot z23.s, z7.b, z1.b[0]\n" - "udot z27.s, z7.b, z2.b[0]\n" - "udot z31.s, z7.b, z3.b[0]\n" + "udot z25.s, z6.b, z2.b[0]\n" + "udot z29.s, z6.b, z3.b[0]\n" + "udot z18.s, z5.b, z0.b[0]\n" + "udot z22.s, z5.b, z1.b[0]\n" + "udot z26.s, z5.b, z2.b[0]\n" + "udot z30.s, z5.b, z3.b[0]\n" + "udot z19.s, z4.b, z0.b[0]\n" + "udot z23.s, z4.b, z1.b[0]\n" + "udot z27.s, z4.b, z2.b[0]\n" + "udot z31.s, z4.b, z3.b[0]\n" "ble 52f\n" - "ld1b { z8.b }, p2/Z, [x28]\n" - "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z8.b, z0.b[1]\n" - "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n" + "udot z16.s, z7.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[1]\n" - "udot z24.s, z8.b, z2.b[1]\n" - "udot z28.s, z8.b, z3.b[1]\n" - "udot z17.s, z9.b, z0.b[1]\n" + "udot z20.s, z7.b, z1.b[1]\n" + "udot z24.s, z7.b, z2.b[1]\n" + "udot z28.s, z7.b, z3.b[1]\n" + "udot z17.s, z6.b, z0.b[1]\n" "addvl x28, x28, #4\n" - "udot z21.s, z9.b, z1.b[1]\n" - "udot z25.s, z9.b, z2.b[1]\n" - "udot z29.s, z9.b, z3.b[1]\n" - "udot z18.s, z10.b, z0.b[1]\n" - "udot z22.s, z10.b, z1.b[1]\n" - "udot z26.s, z10.b, z2.b[1]\n" - "udot z30.s, z10.b, z3.b[1]\n" + "udot z21.s, z6.b, z1.b[1]\n" + "udot z25.s, z6.b, z2.b[1]\n" + "udot z29.s, z6.b, z3.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot z26.s, z5.b, z2.b[1]\n" + "udot z30.s, z5.b, z3.b[1]\n" "udot z19.s, z4.b, z0.b[1]\n" "udot z23.s, z4.b, z1.b[1]\n" "udot z27.s, z4.b, z2.b[1]\n" "udot z31.s, z4.b, z3.b[1]\n" "ble 52f\n" - "ld1b { z5.b }, p2/Z, [x28]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x4\n" - "udot z16.s, z5.b, z0.b[2]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z20.s, z5.b, z1.b[2]\n" - "udot z24.s, z5.b, z2.b[2]\n" - "udot z28.s, z5.b, z3.b[2]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, 
p2/Z, [x28, #3, MUL VL]\n" + "udot z20.s, z7.b, z1.b[2]\n" + "udot z24.s, z7.b, z2.b[2]\n" + "udot z28.s, z7.b, z3.b[2]\n" "udot z17.s, z6.b, z0.b[2]\n" "addvl x28, x28, #4\n" "udot z21.s, z6.b, z1.b[2]\n" "udot z25.s, z6.b, z2.b[2]\n" "udot z29.s, z6.b, z3.b[2]\n" - "udot z18.s, z7.b, z0.b[2]\n" - "udot z22.s, z7.b, z1.b[2]\n" - "udot z26.s, z7.b, z2.b[2]\n" - "udot z30.s, z7.b, z3.b[2]\n" - "udot z19.s, z8.b, z0.b[2]\n" - "udot z23.s, z8.b, z1.b[2]\n" - "udot z27.s, z8.b, z2.b[2]\n" - "udot z31.s, z8.b, z3.b[2]\n" + "udot z18.s, z5.b, z0.b[2]\n" + "udot z22.s, z5.b, z1.b[2]\n" + "udot z26.s, z5.b, z2.b[2]\n" + "udot z30.s, z5.b, z3.b[2]\n" + "udot z19.s, z4.b, z0.b[2]\n" + "udot z23.s, z4.b, z1.b[2]\n" + "udot z27.s, z4.b, z2.b[2]\n" + "udot z31.s, z4.b, z3.b[2]\n" "ble 52f\n" - "ld1b { z9.b }, p2/Z, [x28]\n" - "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n" - "udot z16.s, z9.b, z0.b[3]\n" - "udot z20.s, z9.b, z1.b[3]\n" - "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n" - "udot z24.s, z9.b, z2.b[3]\n" - "udot z28.s, z9.b, z3.b[3]\n" - "udot z17.s, z10.b, z0.b[3]\n" - "udot z21.s, z10.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x28]\n" + "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "udot z16.s, z7.b, z0.b[3]\n" + "udot z20.s, z7.b, z1.b[3]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "udot z24.s, z7.b, z2.b[3]\n" + "udot z28.s, z7.b, z3.b[3]\n" + "udot z17.s, z6.b, z0.b[3]\n" + "udot z21.s, z6.b, z1.b[3]\n" "addvl x28, x28, #4\n" - "udot z25.s, z10.b, z2.b[3]\n" - "udot z29.s, z10.b, z3.b[3]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z22.s, z4.b, z1.b[3]\n" - "udot z26.s, z4.b, z2.b[3]\n" - "udot z30.s, z4.b, z3.b[3]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z23.s, z5.b, z1.b[3]\n" - "udot z27.s, z5.b, z2.b[3]\n" - "udot z31.s, z5.b, z3.b[3]\n" + "udot z25.s, z6.b, z2.b[3]\n" + "udot z29.s, z6.b, z3.b[3]\n" + "udot z18.s, z5.b, z0.b[3]\n" + "udot z22.s, z5.b, z1.b[3]\n" + "udot z26.s, z5.b, z2.b[3]\n" + "udot z30.s, z5.b, z3.b[3]\n" + "udot z19.s, z4.b, z0.b[3]\n" + "udot z23.s, z4.b, z1.b[3]\n" + "udot z27.s, z4.b, z2.b[3]\n" + "udot z31.s, z4.b, z3.b[3]\n" "52:" // Height 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 53f\n" "udot z11.s, z0.b, z15.b\n" @@ -1265,7 +1265,7 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov x20, #0x4\n" "whilelt p0.s, XZR, x20\n" "add x20, %x[qp], %[b_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" "uaddv d11, p0, z11.s\n" "mov z11.s, z11.s[0]\n" "uaddv d12, p0, z12.s\n" @@ -1273,28 +1273,28 @@ void sve_hybrid_u8qa_dot_4x4VL ( "mov z12.s, z12.s[0]\n" "mov z13.s, z13.s[0]\n" "uaddv d14, p0, z14.s\n" - "neg z4.s, p2/M, z4.s\n" + "neg z0.s, p2/M, z0.s\n" "mov z14.s, z14.s[0]\n" - "mul z11.s, p2/M, z11.s, z4.s\n" - "mul z12.s, p2/M, z12.s, z4.s\n" - "mul z13.s, p2/M, z13.s, z4.s\n" - "mul z14.s, p2/M, z14.s, z4.s\n" + "mul z11.s, p2/M, z11.s, z0.s\n" + "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z13.s, p2/M, z13.s, z0.s\n" + "mul z14.s, p2/M, z14.s, z0.s\n" "54:" // Height 4: skip row sum fixup "add z16.s, z16.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" "add z20.s, z20.s, 
z12.s\n" "add z21.s, z21.s, z12.s\n" "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z22.s, z22.s, z12.s\n" "add z23.s, z23.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" @@ -1305,174 +1305,174 @@ void sve_hybrid_u8qa_dot_4x4VL ( "add z29.s, z29.s, z14.s\n" "add z30.s, z30.s, z14.s\n" "add z31.s, z31.s, z14.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z20.s, z20.s, z0.s\n" - "add z21.s, z21.s, z1.s\n" - "add z22.s, z22.s, z2.s\n" - "add z23.s, z23.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - "add z28.s, z28.s, z0.s\n" - "add z29.s, z29.s, z1.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z0.s\n" + "add z18.s, z18.s, z3.s\n" + "add z19.s, z19.s, z2.s\n" + "add z20.s, z20.s, z4.s\n" + "add z21.s, z21.s, z0.s\n" + "add z22.s, z22.s, z3.s\n" + "add z23.s, z23.s, z2.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z0.s\n" + "add z26.s, z26.s, z3.s\n" + "add z27.s, z27.s, z2.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z0.s\n" "ld1rw { z0.s }, p2/Z, [x20]\n" - "add z30.s, z30.s, z2.s\n" - "add z31.s, z31.s, z3.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" - ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" - ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" - ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "add z30.s, z30.s, z3.s\n" + "add z31.s, z31.s, z2.s\n" + ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" + ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" + ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" + ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" + ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" + ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" + ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" + ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" + ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" + ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" + ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" + ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" + ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" + ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" + ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" + ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" "tbz %x[flags], #5, 55f\n" - "and z4.d, z16.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z16.s, z16.s, z4.s\n" - "sqadd z17.s, z17.s, z5.s\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "and z8.d, z20.d, z0.d\n" - "and z9.d, z21.d, z0.d\n" - "and z10.d, z22.d, z0.d\n" - "and z4.d, z23.d, z0.d\n" - "and 
z5.d, z24.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" + "and z2.d, z16.d, z0.d\n" + "and z1.d, z17.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z16.s, z16.s, z2.s\n" + "sqadd z17.s, z17.s, z1.s\n" + "and z7.d, z18.d, z0.d\n" + "and z6.d, z19.d, z0.d\n" + "and z5.d, z20.d, z0.d\n" + "and z4.d, z21.d, z0.d\n" + "and z3.d, z22.d, z0.d\n" + "and z2.d, z23.d, z0.d\n" + "and z1.d, z24.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" - "sqadd z20.s, z20.s, z8.s\n" - "sqadd z21.s, z21.s, z9.s\n" - "sqadd z22.s, z22.s, z10.s\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "and z9.d, z28.d, z0.d\n" - "and z10.d, z29.d, z0.d\n" - "and z4.d, z30.d, z0.d\n" - "and z5.d, z31.d, z0.d\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z18.s, z18.s, z7.s\n" + "sqadd z19.s, z19.s, z6.s\n" + "sqadd z20.s, z20.s, z5.s\n" + "sqadd z21.s, z21.s, z4.s\n" + "sqadd z22.s, z22.s, z3.s\n" + "sqadd z23.s, z23.s, z2.s\n" + "sqadd z24.s, z24.s, z1.s\n" + "and z7.d, z25.d, z0.d\n" + "and z6.d, z26.d, z0.d\n" + "and z5.d, z27.d, z0.d\n" + "and z4.d, z28.d, z0.d\n" + "and z3.d, z29.d, z0.d\n" + "and z2.d, z30.d, z0.d\n" + "and z1.d, z31.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" - "sqadd z28.s, z28.s, z9.s\n" - "sqadd z29.s, z29.s, z10.s\n" - "sqadd z30.s, z30.s, z4.s\n" - "sqadd z31.s, z31.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z25.s, z25.s, z7.s\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd z27.s, z27.s, z5.s\n" + "sqadd z28.s, z28.s, z4.s\n" + "sqadd z29.s, z29.s, z3.s\n" + "sqadd z30.s, z30.s, z2.s\n" + "sqadd z31.s, z31.s, z1.s\n" "55:" // Height 4: no shift correction "add x20, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x20]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z16.s, z16.s, z4.s\n" + "add z16.s, z16.s, z2.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z20.s, z20.s, z4.s\n" + "add z19.s, z19.s, z2.s\n" + "add z20.s, z20.s, z2.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" - "add z21.s, z21.s, z4.s\n" - "add z22.s, z22.s, z4.s\n" + "add z21.s, z21.s, z2.s\n" + "add z22.s, z22.s, z2.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z23.s, z23.s, z2.s\n" + "add z24.s, z24.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // 
srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" - "add z27.s, z27.s, z4.s\n" - "add z28.s, z28.s, z4.s\n" + "add z27.s, z27.s, z2.s\n" + "add z28.s, z28.s, z2.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" - "add z29.s, z29.s, z4.s\n" - "add z30.s, z30.s, z4.s\n" + "add z29.s, z29.s, z2.s\n" + "add z30.s, z30.s, z2.s\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" "add x20, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x20]\n" - "add z31.s, z31.s, z4.s\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add z31.s, z31.s, z2.s\n" "add x20, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x20]\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "smin z16.s, p2/M, z16.s, z1.s\n" + "smin z17.s, p2/M, z17.s, z1.s\n" + "smin z18.s, p2/M, z18.s, z1.s\n" + "smin z19.s, p2/M, z19.s, z1.s\n" + "smin z20.s, p2/M, z20.s, z1.s\n" + "smin z21.s, p2/M, z21.s, z1.s\n" + "smin z22.s, p2/M, z22.s, z1.s\n" + "smin z23.s, p2/M, z23.s, z1.s\n" + "smin z24.s, p2/M, z24.s, z1.s\n" + "smin z25.s, p2/M, z25.s, z1.s\n" + "smin z26.s, p2/M, z26.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z1.s\n" + "smin z29.s, p2/M, z29.s, z1.s\n" + "smin z30.s, p2/M, z30.s, z1.s\n" + "smin z31.s, p2/M, z31.s, z1.s\n" + "smax z16.s, p2/M, z16.s, z0.s\n" + "smax z17.s, p2/M, z17.s, z0.s\n" + "smax z18.s, p2/M, z18.s, z0.s\n" "uzp1 z16.h, z16.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z0.s\n" + "smax z20.s, p2/M, z20.s, z0.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z21.s, p2/M, z21.s, z5.s\n" - "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z21.s, p2/M, z21.s, z0.s\n" + "smax z22.s, p2/M, z22.s, z0.s\n" "uzp1 z20.h, z20.h, z21.h\n" "st1b { z16.b }, p1, [x27]\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z21.h, z22.h, z23.h\n" - "uzp1 z20.b, z20.b, z21.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z0.s\n" + "uzp1 z16.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z0.s\n" "uzp1 z24.h, z24.h, z25.h\n" "st1b { z20.b }, p1, [x23]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "smax z29.s, p2/M, z29.s, z5.s\n" - "smax z30.s, p2/M, z30.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z0.s\n" + "smax z28.s, p2/M, z28.s, z0.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "smax z29.s, p2/M, z29.s, 
z0.s\n"
+ "smax z30.s, p2/M, z30.s, z0.s\n"
 "uzp1 z28.h, z28.h, z29.h\n"
 "st1b { z24.b }, p1, [x22]\n"
- "smax z31.s, p2/M, z31.s, z5.s\n"
- "uzp1 z29.h, z30.h, z31.h\n"
- "uzp1 z28.b, z28.b, z29.b\n"
+ "smax z31.s, p2/M, z31.s, z0.s\n"
+ "uzp1 z16.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z16.b\n"
 "st1b { z28.b }, p1, [x21]\n"
 "addvl x27, x27, #1\n"
 "56:" // Height 4: Writeback done
@@ -1491,7 +1491,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
 "b 1b\n"
 "58:" // Exit
-
 : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
 : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
 : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
@@ -1499,4 +1498,4 @@ void sve_hybrid_u8qa_dot_4x4VL (
 }
 
 } // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
index da27554a0f..5de68cc738 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -10,20 +10,20 @@
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
 */
 #pragma once
-
 #ifdef ARM_COMPUTE_ENABLE_SVE
+
 #include "../std_transforms_sve.hpp"
 #include "../performance_parameters.hpp"
 
@@ -74,7 +74,6 @@ public:
 template
 static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
 {
-
 if (std::is_same::value) {
 switch (ci->get_cpu_model()) {
 default:
@@ -97,5 +96,4 @@
 } // namespace arm_gemm
 
 #undef ARGLIST
-
 #endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
index f9d38c2925..69894bec41 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -108,11 +108,11 @@ void sve_hybrid_u8qa_mmla_4x4VL (
 "4:" // Height 1: String loop
 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
 "ldr w25, [x20, x26, LSL #0x2]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
 "tbz %x[flags], #3, 5f\n"
- "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n"
- "add x21, x21, x20, LSL #3\n"
- "ldr x24, [x21, #0x0]\n"
+ "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n"
+ "add x20, x20, x21, LSL #3\n"
+ "ldr x24, [x20, #0x0]\n"
 "cbnz x26, 6f\n"
 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
 "add x24, x24, x20\n"
@@ -125,41 +125,41 @@ void sve_hybrid_u8qa_mmla_4x4VL (
 "7:" // Height 1: Multiply loop: Main loop head
 "whilelt p0.b, XZR, x25\n"
 "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "trn1 z0.d, z1.d, z2.d\n"
- "ld1b { z5.b }, p2/Z, [x28]\n"
- ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
- "trn2 z1.d, z1.d, z2.d\n"
- "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
- ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
- "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
- ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
- ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
- "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
- "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
 "addvl x28, x28, #16\n"
- ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
- "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
- ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
- ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
- "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
- "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
- ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
- "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
- "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
- ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
- ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
- 
"ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n" + ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n" + ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" + ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" + ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" + ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" "add x24, x24, #0x10\n" "tbnz %x[flags], #31, 8f\n" "udot z11.s, z0.b, z15.b\n" @@ -171,43 +171,43 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "trn1 z0.d, z1.d, z27.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z27.d\n" + ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n" + ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n" + ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n" + ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n" + ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n" "addvl x28, x28, #8\n" "ble 10f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - "ld1b { z10.b }, 
p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n" + ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n" + ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" + ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" "addvl x28, x28, #8\n" "10:" // Height 1: Multiply loop: multiply skip "tbnz %x[flags], #31, 11f\n" @@ -224,74 +224,74 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp1 z19.d, z19.d, z23.d\n" "mov z23.d, z16.d\n" "tbnz %x[flags], #31, 12f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z1.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "neg z1.s, p2/M, z1.s\n" + "neg z16.s, p2/M, z16.s\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z1.s\n" + "mul z11.s, p2/M, z11.s, z16.s\n" "12:" // Height 1: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z17.s, z17.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x10]\n" + "ld1w { z21.s }, p2/Z, [x10, #1, MUL VL]\n" "add z18.s, z18.s, z11.s\n" "add z19.s, z19.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "ld1w { z20.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [x10, #3, MUL VL]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" - "add z23.s, z23.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + "add z23.s, z23.s, z22.s\n" + "add z17.s, z17.s, z21.s\n" + "add z18.s, z18.s, z20.s\n" + "add z19.s, z19.s, z16.s\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n" + ".inst 0x04b07631 // sqrdmulh z17.s, z17.s, z16.s\n" "addvl x10, x10, #4\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n" + ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n" "tbz %x[flags], #5, 13f\n" - "and z4.d, z23.d, z0.d\n" - "and z5.d, z17.d, z0.d\n" - "and z6.d, z18.d, z0.d\n" - "and z7.d, z19.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "sqadd z17.s, z17.s, 
z5.s\n" - "sqadd z18.s, z18.s, z6.s\n" - "sqadd z19.s, z19.s, z7.s\n" + "and z22.d, z23.d, z0.d\n" + "and z21.d, z17.d, z0.d\n" + "and z20.d, z18.d, z0.d\n" + "and z16.d, z19.d, z0.d\n" + "asr z22.s, z22.s, #0x1f\n" + "asr z21.s, z21.s, #0x1f\n" + "asr z20.s, z20.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z23.s, z23.s, z22.s\n" + "sqadd z17.s, z17.s, z21.s\n" + "sqadd z18.s, z18.s, z20.s\n" + "sqadd z19.s, z19.s, z16.s\n" "13:" // Height 1: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z16.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z16.s\n" + "add z18.s, z18.s, z16.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z19.s, z19.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z20.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z16.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z16.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z20.s\n" + "smin z17.s, p2/M, z17.s, z20.s\n" + "smin z18.s, p2/M, z18.s, z20.s\n" + "smin z19.s, p2/M, z19.s, z20.s\n" + "smax z23.s, p2/M, z23.s, z16.s\n" + "smax z17.s, p2/M, z17.s, z16.s\n" + "smax z18.s, p2/M, z18.s, z16.s\n" "uzp1 z23.h, z23.h, z17.h\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "uzp1 z17.h, z18.h, z19.h\n" - "uzp1 z23.b, z23.b, z17.b\n" + "smax z19.s, p2/M, z19.s, z16.s\n" + "uzp1 z16.h, z18.h, z19.h\n" + "uzp1 z23.b, z23.b, z16.b\n" "st1b { z23.b }, p1, [x27]\n" "addvl x27, x27, #1\n" "14:" // Height 1: Writeback done @@ -324,12 +324,12 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "18:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 19f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" "cbnz x26, 20f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -337,49 +337,49 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "b 20f\n" "19:" // Height 2: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" + "add x23, x24, x21\n" "20:" // Height 2: input setup done "cmp x25, #0x10\n" "ble 23f\n" "21:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" 
- ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + "ld1rqb { z26.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #1, MUL VL]\n" + "trn2 z1.d, z1.d, z26.d\n" + "ld1b { z24.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45d99814 // ummla z20.s, z0.b, z25.b\n" + ".inst 0x45d89811 // ummla z17.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45da9815 // ummla z21.s, z0.b, z26.b\n" + ".inst 0x45d99812 // ummla z18.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + ".inst 0x45d89816 // ummla z22.s, z0.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45da9813 // ummla z19.s, z0.b, z26.b\n" + ".inst 0x45d99817 // ummla z23.s, z0.b, z25.b\n" + "ld1b { z26.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45da9834 // ummla z20.s, z1.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #-4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n" + ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #-2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45db9832 // ummla z18.s, z1.b, z27.b\n" + ".inst 0x45da9836 // ummla z22.s, z1.b, z26.b\n" + ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" + ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" "tbnz %x[flags], #31, 22f\n" @@ -392,44 +392,44 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "23:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x25\n" "ld1rqb { z1.b }, p0/Z, [x24]\n" - "ld1rqb { z2.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn1 z0.d, z1.d, z27.d\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45d89810 // ummla z16.s, z0.b, z24.b\n" + "ld1b { z26.b }, p2/Z, [x28, #1, MUL VL]\n" 
"subs x25, x25, #0x8\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + "trn2 z1.d, z1.d, z27.d\n" + ".inst 0x45da9814 // ummla z20.s, z0.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z26.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45d99811 // ummla z17.s, z0.b, z25.b\n" + ".inst 0x45d89815 // ummla z21.s, z0.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45db9812 // ummla z18.s, z0.b, z27.b\n" + ".inst 0x45da9816 // ummla z22.s, z0.b, z26.b\n" + ".inst 0x45d99813 // ummla z19.s, z0.b, z25.b\n" + ".inst 0x45d89817 // ummla z23.s, z0.b, z24.b\n" "addvl x28, x28, #8\n" "ble 24f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" + "ld1b { z24.b }, p2/Z, [x28]\n" + ".inst 0x45d89830 // ummla z16.s, z1.b, z24.b\n" + "ld1b { z24.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45d89834 // ummla z20.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45d99831 // ummla z17.s, z1.b, z25.b\n" + ".inst 0x45d89835 // ummla z21.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45d99832 // ummla z18.s, z1.b, z25.b\n" + ".inst 0x45d89836 // ummla z22.s, z1.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z24.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45d99833 // ummla z19.s, z1.b, z25.b\n" + ".inst 0x45d89837 // ummla z23.s, z1.b, z24.b\n" "addvl x28, x28, #8\n" "24:" // Height 2: Multiply loop: multiply skip "tbnz %x[flags], #31, 25f\n" @@ -440,133 +440,133 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "add x26, x26, #0x1\n" "cmp x26, x20\n" "bne 18b\n" - "uzp1 z7.d, z16.d, z20.d\n" + "uzp1 z24.d, z16.d, z20.d\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "uzp2 z16.d, z16.d, z20.d\n" - "add x22, x27, x20\n" + "add x23, x27, x20\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, z19.d, z23.d\n" "uzp2 z19.d, z19.d, z23.d\n" - "mov z23.d, z7.d\n" + "mov z23.d, z24.d\n" "tbnz %x[flags], #31, 26f\n" - "add x23, 
%x[qp], %[b_offset]\n" - "ld1rw { z2.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" - "neg z2.s, p2/M, z2.s\n" + "neg z24.s, p2/M, z24.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z2.s\n" - "mul z12.s, p2/M, z12.s, z2.s\n" + "mul z11.s, p2/M, z11.s, z24.s\n" + "mul z12.s, p2/M, z12.s, z24.s\n" "26:" // Height 2: skip row sum fixup "add z23.s, z23.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10]\n" + "ld1w { z27.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z25.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" - "add z23.s, z23.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z23.s, z23.s, z28.s\n" + "add z20.s, z20.s, z27.s\n" "addvl x10, x10, #4\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "add z21.s, z21.s, z26.s\n" + "add z22.s, z22.s, z25.s\n" + "add z16.s, z16.s, z28.s\n" + "add z17.s, z17.s, z27.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z18.s, z18.s, z26.s\n" + "add z19.s, z19.s, z25.s\n" + ".inst 0x04b876f7 // sqrdmulh z23.s, z23.s, z24.s\n" + ".inst 0x04b87694 // sqrdmulh z20.s, z20.s, z24.s\n" + ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n" + ".inst 0x04b876d6 // sqrdmulh z22.s, z22.s, z24.s\n" + ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n" + ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n" + ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n" + ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n" "tbz %x[flags], #5, 27f\n" - "and z4.d, z23.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z23.s, z23.s, z4.s\n" - "and z5.d, z20.d, z0.d\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" + "and z24.d, z23.d, z0.d\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z23.s, 
z23.s, z24.s\n" + "and z30.d, z20.d, z0.d\n" + "and z29.d, z21.d, z0.d\n" + "and z28.d, z22.d, z0.d\n" + "and z27.d, z16.d, z0.d\n" + "and z26.d, z17.d, z0.d\n" + "and z25.d, z18.d, z0.d\n" + "and z24.d, z19.d, z0.d\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z27.s, z27.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z25.s, z25.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + "sqadd z20.s, z20.s, z30.s\n" + "sqadd z21.s, z21.s, z29.s\n" + "sqadd z22.s, z22.s, z28.s\n" + "sqadd z16.s, z16.s, z27.s\n" + "sqadd z17.s, z17.s, z26.s\n" + "sqadd z18.s, z18.s, z25.s\n" + "sqadd z19.s, z19.s, z24.s\n" "27:" // Height 2: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z23.s, z23.s, z4.s\n" + "add z23.s, z23.s, z24.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z24.s\n" + "add z21.s, z21.s, z24.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z24.s\n" + "add z16.s, z16.s, z24.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z24.s\n" + "add z18.s, z18.s, z24.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z19.s, z19.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z25.s }, p2/Z, [x20]\n" + "add z19.s, z19.s, z24.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z24.s }, p2/Z, [x20]\n" + "smin z23.s, p2/M, z23.s, z25.s\n" + "smin z20.s, p2/M, z20.s, z25.s\n" + "smin z21.s, p2/M, z21.s, z25.s\n" + "smin z22.s, p2/M, z22.s, z25.s\n" + "smin z16.s, p2/M, z16.s, z25.s\n" + "smin z17.s, p2/M, z17.s, z25.s\n" + "smin z18.s, p2/M, z18.s, z25.s\n" + "smin z19.s, p2/M, z19.s, z25.s\n" + "smax z23.s, p2/M, z23.s, z24.s\n" + "smax z20.s, p2/M, z20.s, z24.s\n" + "smax z21.s, p2/M, z21.s, z24.s\n" "uzp1 z23.h, z23.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z24.s\n" + "smax z16.s, p2/M, z16.s, z24.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z23.b, z23.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z24.s\n" + "smax z18.s, p2/M, z18.s, z24.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z23.b }, p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z24.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "st1b { z16.b }, p1, [x22]\n" + "st1b { z16.b }, p1, [x23]\n" "addvl x27, x27, #1\n" "28:" // Height 2: Writeback done "decw x9, ALL, MUL #4\n" @@ -607,13 +607,13 @@ 
void sve_hybrid_u8qa_mmla_4x4VL ( "32:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 33f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" "cbnz x26, 34f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -622,8 +622,8 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "b 34f\n" "33:" // Height 3: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "34:" // Height 3: input setup done "cmp x25, #0x10\n" "ble 37f\n" @@ -634,60 +634,60 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "ld1rqb { z3.b }, p0/Z, [x22]\n" "trn1 z0.d, z1.d, z2.d\n" "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn1 z2.d, z3.d, z5.d\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n" + ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45c99815 // ummla z21.s, z0.b, z9.b\n" + ".inst 0x45c9985d // ummla z29.s, z2.b, z9.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" + ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n" + ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n" + ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n" + 
"ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n" + ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n" + ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" "add x23, x23, #0x10\n" ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" "add x22, x22, #0x10\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" + ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" + ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" + ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" + ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" + ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" + ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" + ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" + ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" + ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" - ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" + ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" "tbnz %x[flags], #31, 36f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" @@ -708,56 +708,56 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "trn1 z2.d, z3.d, z4.d\n" "trn2 z3.d, z3.d, z4.d\n" ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #2, MUL VL]\n" "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" "subs x25, x25, #0x8\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" - ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n" + ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45c99811 // ummla z17.s, z0.b, z9.b\n" + ".inst 0x45c99859 // ummla z25.s, z2.b, z9.b\n" ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" "addvl x28, x28, 
#8\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" - ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n" + ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n" + ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n" + ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n" + ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n" + ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n" + ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n" + ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n" "ble 38f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" + ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n" + ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n" + ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" + ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" + ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" + ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" - ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" + ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" "38:" // Height 3: Multiply loop: multiply skip "tbnz %x[flags], #31, 39f\n" "udot z11.s, z0.b, z15.b\n" @@ -770,12 +770,12 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "cmp x26, x20\n" "bne 32b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z16.d, z20.d\n" - "add x22, x27, x20\n" + "uzp1 z0.d, z16.d, z20.d\n" + "add x23, x27, x20\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, z21.d\n" "uzp2 z17.d, z17.d, z21.d\n" - "add x21, x22, x20\n" + "add x22, x23, x20\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" "uzp1 z22.d, 
z19.d, z23.d\n" @@ -784,170 +784,170 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" "uzp1 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" + "mov z31.d, z0.d\n" "tbnz %x[flags], #31, 40f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z3.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "neg z3.s, p2/M, z3.s\n" + "neg z23.s, p2/M, z23.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z11.s, p2/M, z11.s, z23.s\n" "mov z13.s, z13.s[0]\n" - "mul z12.s, p2/M, z12.s, z3.s\n" - "mul z13.s, p2/M, z13.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z23.s\n" + "mul z13.s, p2/M, z13.s, z23.s\n" "40:" // Height 3: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add z24.s, z24.s, z13.s\n" "add z25.s, z25.s, z13.s\n" "addvl x10, x10, #4\n" "add z26.s, z26.s, z13.s\n" "add z27.s, z27.s, z13.s\n" "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" + "add z20.s, z20.s, z30.s\n" + "add z21.s, z21.s, z29.s\n" + "add z22.s, z22.s, z28.s\n" "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" + "add z17.s, z17.s, z30.s\n" + "add z18.s, z18.s, z29.s\n" + "add z19.s, z19.s, z28.s\n" "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z25.s, z25.s, z30.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z26.s, z26.s, z29.s\n" + "add z27.s, z27.s, z28.s\n" + ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + ".inst 0x04b77610 // sqrdmulh z16.s, z16.s, z23.s\n" + ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n" + ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n" + ".inst 0x04b77673 // 
sqrdmulh z19.s, z19.s, z23.s\n" + ".inst 0x04b77718 // sqrdmulh z24.s, z24.s, z23.s\n" + ".inst 0x04b77739 // sqrdmulh z25.s, z25.s, z23.s\n" + ".inst 0x04b7775a // sqrdmulh z26.s, z26.s, z23.s\n" + ".inst 0x04b7777b // sqrdmulh z27.s, z27.s, z23.s\n" "tbz %x[flags], #5, 41f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "and z5.d, z24.d, z0.d\n" - "and z6.d, z25.d, z0.d\n" - "and z7.d, z26.d, z0.d\n" - "and z8.d, z27.d, z0.d\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "sqadd z24.s, z24.s, z5.s\n" - "sqadd z25.s, z25.s, z6.s\n" - "sqadd z26.s, z26.s, z7.s\n" - "sqadd z27.s, z27.s, z8.s\n" + "and z1.d, z31.d, z0.d\n" + "and z30.d, z20.d, z0.d\n" + "and z29.d, z21.d, z0.d\n" + "and z28.d, z22.d, z0.d\n" + "and z23.d, z16.d, z0.d\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z31.s, z31.s, z1.s\n" + "sqadd z20.s, z20.s, z30.s\n" + "sqadd z21.s, z21.s, z29.s\n" + "sqadd z22.s, z22.s, z28.s\n" + "sqadd z16.s, z16.s, z23.s\n" + "and z3.d, z17.d, z0.d\n" + "and z2.d, z18.d, z0.d\n" + "and z1.d, z19.d, z0.d\n" + "and z30.d, z24.d, z0.d\n" + "and z29.d, z25.d, z0.d\n" + "and z28.d, z26.d, z0.d\n" + "and z23.d, z27.d, z0.d\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z29.s, z29.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z23.s, z23.s, #0x1f\n" + "sqadd z17.s, z17.s, z3.s\n" + "sqadd z18.s, z18.s, z2.s\n" + "sqadd z19.s, z19.s, z1.s\n" + "sqadd z24.s, z24.s, z30.s\n" + "sqadd z25.s, z25.s, z29.s\n" + "sqadd z26.s, z26.s, z28.s\n" + "sqadd z27.s, z27.s, z23.s\n" "41:" // Height 3: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add z31.s, z31.s, z4.s\n" + "add z31.s, z31.s, z23.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z23.s\n" + "add z21.s, z21.s, z23.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z23.s\n" + "add z16.s, z16.s, z23.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z23.s\n" + "add z18.s, z18.s, z23.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add 
z19.s, z19.s, z23.s\n" + "add z24.s, z24.s, z23.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z23.s\n" + "add z26.s, z26.s, z23.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z27.s, z27.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z28.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z23.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z23.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z28.s\n" + "smin z20.s, p2/M, z20.s, z28.s\n" + "smin z21.s, p2/M, z21.s, z28.s\n" + "smin z22.s, p2/M, z22.s, z28.s\n" + "smin z16.s, p2/M, z16.s, z28.s\n" + "smin z17.s, p2/M, z17.s, z28.s\n" + "smin z18.s, p2/M, z18.s, z28.s\n" + "smin z19.s, p2/M, z19.s, z28.s\n" + "smin z24.s, p2/M, z24.s, z28.s\n" + "smin z25.s, p2/M, z25.s, z28.s\n" + "smin z26.s, p2/M, z26.s, z28.s\n" + "smin z27.s, p2/M, z27.s, z28.s\n" + "smax z31.s, p2/M, z31.s, z23.s\n" + "smax z20.s, p2/M, z20.s, z23.s\n" + "smax z21.s, p2/M, z21.s, z23.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z23.s\n" + "smax z16.s, p2/M, z16.s, z23.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z23.s\n" + "smax z18.s, p2/M, z18.s, z23.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z31.b }, p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z23.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z23.s\n" + "smax z26.s, p2/M, z26.s, z23.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x21]\n" + "st1b { z16.b }, p1, [x23]\n" + "smax z27.s, p2/M, z27.s, z23.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x22]\n" "addvl x27, x27, #1\n" "42:" // Height 3: Writeback done "decw x9, ALL, MUL #4\n" @@ -992,14 +992,14 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "46:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w25, [x20, x26, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 47f\n" - "ldr x21, [%x[input_ptr], x26, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x24, [x21, #0x0]\n" - "ldr x23, [x21, #0x8]\n" - "ldr x22, [x21, #0x10]\n" - "ldr x21, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x26, LSL #0x3]\n" + "add x20, x20, 
x21, LSL #3\n" + "ldr x24, [x20, #0x0]\n" + "ldr x23, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x21, [x20, #0x18]\n" "cbnz x26, 48f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x24, x24, x20\n" @@ -1009,9 +1009,9 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "b 48f\n" "47:" // Height 4: setup direct input "mov x24, %x[input_ptr]\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "48:" // Height 4: input setup done "cmp x25, #0x10\n" "ble 51f\n" @@ -1021,63 +1021,63 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "ld1rqb { z2.b }, p0/Z, [x23]\n" "trn1 z0.d, z1.d, z2.d\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z4.b }, p0/Z, [x21]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" - ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" - "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" + "trn1 z2.d, z3.d, z5.d\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n" + ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #2, MUL VL]\n" + ".inst 0x45c49814 // ummla z20.s, z0.b, z4.b\n" + ".inst 0x45c4985c // ummla z28.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c59811 // ummla z17.s, z0.b, z5.b\n" + ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c49815 // ummla z21.s, z0.b, z4.b\n" + ".inst 0x45c4985d // ummla z29.s, z2.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" "addvl x28, x28, #16\n" - "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" - ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n" - ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n" - "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n" - ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-8, MUL VL]\n" + ".inst 0x45c89812 // ummla z18.s, z0.b, z8.b\n" + ".inst 0x45c8985a // ummla z26.s, z2.b, z8.b\n" + ".inst 0x45c79816 // ummla z22.s, z0.b, z7.b\n" + "ld1b { z10.b }, p2/Z, [x28, #-7, MUL VL]\n" + "ld1b { z9.b }, p2/Z, [x28, #-6, MUL VL]\n" + ".inst 0x45c7985e // ummla z30.s, z2.b, z7.b\n" + ".inst 0x45c69813 // ummla z19.s, z0.b, z6.b\n" + "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n" + ".inst 0x45c6985b // ummla z27.s, z2.b, z6.b\n" ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - "ld1b { z4.b }, p2/Z, [x28, #-3, MUL 
VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n" "add x24, x24, #0x10\n" ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" - "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" + ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [x28, #-1, MUL VL]\n" + ".inst 0x45ca9834 // ummla z20.s, z1.b, z10.b\n" "add x22, x22, #0x10\n" - ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" + ".inst 0x45ca987c // ummla z28.s, z3.b, z10.b\n" + ".inst 0x45c99831 // ummla z17.s, z1.b, z9.b\n" "add x21, x21, #0x10\n" - ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c99879 // ummla z25.s, z3.b, z9.b\n" + ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" + ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" + ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" + ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" + ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" - ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" + ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" "tbnz %x[flags], #31, 50f\n" "udot z11.s, z0.b, z15.b\n" "udot z13.s, z2.b, z15.b\n" @@ -1093,62 +1093,62 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "ld1rqb { z2.b }, p0/Z, [x23]\n" "trn1 z0.d, z1.d, z2.d\n" "ld1rqb { z3.b }, p0/Z, [x22]\n" - "ld1rqb { z4.b }, p0/Z, [x21]\n" + "ld1rqb { z5.b }, p0/Z, [x21]\n" "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z5.b }, p2/Z, [x28]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n" - ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n" + "trn1 z2.d, z3.d, z5.d\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + "trn2 z3.d, z3.d, z5.d\n" + ".inst 0x45c49810 // ummla z16.s, z0.b, z4.b\n" + ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" "subs x25, x25, #0x8\n" - ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n" + ".inst 0x45c59814 // ummla z20.s, z0.b, z5.b\n" "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n" - ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n" - ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + ".inst 0x45c5985c // ummla z28.s, z2.b, z5.b\n" + ".inst 0x45c49811 // ummla z17.s, z0.b, z4.b\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" + ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n" ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n" - "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n" + 
"ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n" - ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n" + ".inst 0x45c79812 // ummla z18.s, z0.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n" - ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n" - ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n" - ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n" - ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n" - ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n" - ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n" + ".inst 0x45c7985a // ummla z26.s, z2.b, z7.b\n" + ".inst 0x45c69816 // ummla z22.s, z0.b, z6.b\n" + ".inst 0x45c6985e // ummla z30.s, z2.b, z6.b\n" + ".inst 0x45c59813 // ummla z19.s, z0.b, z5.b\n" + ".inst 0x45c5985b // ummla z27.s, z2.b, z5.b\n" + ".inst 0x45c49817 // ummla z23.s, z0.b, z4.b\n" + ".inst 0x45c4985f // ummla z31.s, z2.b, z4.b\n" "ble 52f\n" - "ld1b { z6.b }, p2/Z, [x28]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n" - "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n" - "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n" - "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n" - ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n" - ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n" - "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n" - "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n" - ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n" - ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n" + "ld1b { z4.b }, p2/Z, [x28]\n" + ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" + ".inst 0x45c49878 // ummla z24.s, z3.b, z4.b\n" + "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n" + "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n" + "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n" + ".inst 0x45c59834 // ummla z20.s, z1.b, z5.b\n" + ".inst 0x45c5987c // ummla z28.s, z3.b, z5.b\n" + "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n" + ".inst 0x45c49831 // ummla z17.s, z1.b, z4.b\n" + ".inst 0x45c49879 // ummla z25.s, z3.b, z4.b\n" "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n" - "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n" - ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n" - ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n" - ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n" - ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n" + "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n" + ".inst 0x45c89835 // ummla z21.s, z1.b, z8.b\n" + ".inst 0x45c8987d // ummla z29.s, z3.b, z8.b\n" + ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + ".inst 0x45c7987a // ummla z26.s, z3.b, z7.b\n" "addvl x28, x28, #8\n" - ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n" - ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n" + ".inst 0x45c69836 // ummla z22.s, z1.b, z6.b\n" + ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n" ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" - ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n" - ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n" + ".inst 0x45c49837 // ummla z23.s, z1.b, z4.b\n" + ".inst 0x45c4987f // ummla z31.s, z3.b, z4.b\n" "52:" // Height 4: Multiply loop: multiply skip "tbnz %x[flags], #31, 53f\n" "udot z11.s, z0.b, z15.b\n" @@ -1161,12 +1161,12 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "cmp x26, x20\n" "bne 46b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 z7.d, z16.d, z20.d\n" - "add x22, x27, x20\n" - "add x21, x22, x20\n" + "uzp1 z0.d, z16.d, z20.d\n" + "add x23, x27, x20\n" + "add x22, x23, x20\n" "uzp2 z16.d, z16.d, z20.d\n" "uzp1 z20.d, z17.d, 
z21.d\n" - "add x20, x21, x20\n" + "add x21, x22, x20\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" "uzp2 z18.d, z18.d, z22.d\n" @@ -1180,38 +1180,38 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "uzp2 z26.d, z26.d, z30.d\n" "uzp1 z30.d, z27.d, z31.d\n" "uzp2 z27.d, z27.d, z31.d\n" - "mov z31.d, z7.d\n" + "mov z31.d, z0.d\n" "tbnz %x[flags], #31, 54f\n" - "add x23, %x[qp], %[b_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[b_offset]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n" ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n" - "neg z4.s, p2/M, z4.s\n" + "neg z0.s, p2/M, z0.s\n" "mov z12.s, z11.s[3]\n" "mov z11.s, z11.s[0]\n" - "mul z11.s, p2/M, z11.s, z4.s\n" + "mul z11.s, p2/M, z11.s, z0.s\n" "mov z14.s, z13.s[3]\n" "mov z13.s, z13.s[0]\n" - "mul z12.s, p2/M, z12.s, z4.s\n" - "mul z13.s, p2/M, z13.s, z4.s\n" - "mul z14.s, p2/M, z14.s, z4.s\n" + "mul z12.s, p2/M, z12.s, z0.s\n" + "mul z13.s, p2/M, z13.s, z0.s\n" + "mul z14.s, p2/M, z14.s, z0.s\n" "54:" // Height 4: skip row sum fixup "add z31.s, z31.s, z11.s\n" "add z20.s, z20.s, z11.s\n" - "ld1w { z0.s }, p2/Z, [x10]\n" - "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [x10]\n" + "ld1w { z0.s }, p2/Z, [x10, #1, MUL VL]\n" "add z21.s, z21.s, z11.s\n" "add z22.s, z22.s, z11.s\n" - "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" - "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x10, #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x10, #3, MUL VL]\n" "add z16.s, z16.s, z12.s\n" "add z17.s, z17.s, z12.s\n" - "add x23, %x[qp], %[per_layer_mul]\n" + "add x20, %x[qp], %[per_layer_mul]\n" "orr %x[flags], %x[flags], #0x80000000\n" "add z18.s, z18.s, z12.s\n" "add z19.s, z19.s, z12.s\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" - "add x23, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" "add z23.s, z23.s, z13.s\n" "add z28.s, z28.s, z13.s\n" "addvl x10, x10, #4\n" @@ -1221,175 +1221,175 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "add z25.s, z25.s, z14.s\n" "add z26.s, z26.s, z14.s\n" "add z27.s, z27.s, z14.s\n" - "add z31.s, z31.s, z0.s\n" - "add z20.s, z20.s, z1.s\n" - "add z21.s, z21.s, z2.s\n" - "add z22.s, z22.s, z3.s\n" - "add z16.s, z16.s, z0.s\n" - "add z17.s, z17.s, z1.s\n" - "add z18.s, z18.s, z2.s\n" - "add z19.s, z19.s, z3.s\n" - "add z23.s, z23.s, z0.s\n" - "add z28.s, z28.s, z1.s\n" - "add z29.s, z29.s, z2.s\n" - "add z30.s, z30.s, z3.s\n" - "add z24.s, z24.s, z0.s\n" - "add z25.s, z25.s, z1.s\n" - "ld1rw { z0.s }, p2/Z, [x23]\n" - "add z26.s, z26.s, z2.s\n" - "add z27.s, z27.s, z3.s\n" - ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" - ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" - ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" - ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" - ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" - ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" - ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" - ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" - ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" - ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" - ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" - ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" - ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" - ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" - ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" - ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "add z20.s, 
z20.s, z0.s\n" + "add z21.s, z21.s, z3.s\n" + "add z22.s, z22.s, z2.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z0.s\n" + "add z18.s, z18.s, z3.s\n" + "add z19.s, z19.s, z2.s\n" + "add z23.s, z23.s, z4.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z3.s\n" + "add z30.s, z30.s, z2.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z26.s, z26.s, z3.s\n" + "add z27.s, z27.s, z2.s\n" + ".inst 0x04a177ff // sqrdmulh z31.s, z31.s, z1.s\n" + ".inst 0x04a17694 // sqrdmulh z20.s, z20.s, z1.s\n" + ".inst 0x04a176b5 // sqrdmulh z21.s, z21.s, z1.s\n" + ".inst 0x04a176d6 // sqrdmulh z22.s, z22.s, z1.s\n" + ".inst 0x04a17610 // sqrdmulh z16.s, z16.s, z1.s\n" + ".inst 0x04a17631 // sqrdmulh z17.s, z17.s, z1.s\n" + ".inst 0x04a17652 // sqrdmulh z18.s, z18.s, z1.s\n" + ".inst 0x04a17673 // sqrdmulh z19.s, z19.s, z1.s\n" + ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n" + ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n" + ".inst 0x04a177bd // sqrdmulh z29.s, z29.s, z1.s\n" + ".inst 0x04a177de // sqrdmulh z30.s, z30.s, z1.s\n" + ".inst 0x04a17718 // sqrdmulh z24.s, z24.s, z1.s\n" + ".inst 0x04a17739 // sqrdmulh z25.s, z25.s, z1.s\n" + ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n" + ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n" "tbz %x[flags], #5, 55f\n" - "and z4.d, z31.d, z0.d\n" - "and z5.d, z20.d, z0.d\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z31.s, z31.s, z4.s\n" - "sqadd z20.s, z20.s, z5.s\n" - "and z6.d, z21.d, z0.d\n" - "and z7.d, z22.d, z0.d\n" - "and z8.d, z16.d, z0.d\n" - "and z9.d, z17.d, z0.d\n" - "and z10.d, z18.d, z0.d\n" - "and z4.d, z19.d, z0.d\n" - "and z5.d, z23.d, z0.d\n" - "asr z6.s, z6.s, #0x1f\n" + "and z2.d, z31.d, z0.d\n" + "and z1.d, z20.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z31.s, z31.s, z2.s\n" + "sqadd z20.s, z20.s, z1.s\n" + "and z7.d, z21.d, z0.d\n" + "and z6.d, z22.d, z0.d\n" + "and z5.d, z16.d, z0.d\n" + "and z4.d, z17.d, z0.d\n" + "and z3.d, z18.d, z0.d\n" + "and z2.d, z19.d, z0.d\n" + "and z1.d, z23.d, z0.d\n" "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" - "asr z4.s, z4.s, #0x1f\n" - "asr z5.s, z5.s, #0x1f\n" - "sqadd z21.s, z21.s, z6.s\n" - "sqadd z22.s, z22.s, z7.s\n" - "sqadd z16.s, z16.s, z8.s\n" - "sqadd z17.s, z17.s, z9.s\n" - "sqadd z18.s, z18.s, z10.s\n" - "sqadd z19.s, z19.s, z4.s\n" - "sqadd z23.s, z23.s, z5.s\n" - "and z6.d, z28.d, z0.d\n" - "and z7.d, z29.d, z0.d\n" - "and z8.d, z30.d, z0.d\n" - "and z9.d, z24.d, z0.d\n" - "and z10.d, z25.d, z0.d\n" - "and z4.d, z26.d, z0.d\n" - "and z5.d, z27.d, z0.d\n" "asr z6.s, z6.s, #0x1f\n" - "asr z7.s, z7.s, #0x1f\n" - "asr z8.s, z8.s, #0x1f\n" - "asr z9.s, z9.s, #0x1f\n" - "asr z10.s, z10.s, #0x1f\n" + "asr z5.s, z5.s, #0x1f\n" "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z21.s, z21.s, z7.s\n" + "sqadd z22.s, z22.s, z6.s\n" + "sqadd z16.s, z16.s, z5.s\n" + "sqadd z17.s, z17.s, z4.s\n" + "sqadd z18.s, z18.s, z3.s\n" + "sqadd z19.s, z19.s, z2.s\n" + "sqadd z23.s, z23.s, z1.s\n" + "and z7.d, z28.d, z0.d\n" + "and z6.d, z29.d, z0.d\n" + "and z5.d, z30.d, z0.d\n" + "and z4.d, z24.d, z0.d\n" + "and z3.d, z25.d, z0.d\n" + "and z2.d, z26.d, z0.d\n" + "and z1.d, z27.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "asr z6.s, z6.s, #0x1f\n" "asr z5.s, z5.s, #0x1f\n" - "sqadd z28.s, z28.s, z6.s\n" - "sqadd z29.s, z29.s, z7.s\n" - "sqadd z30.s, 
z30.s, z8.s\n" - "sqadd z24.s, z24.s, z9.s\n" - "sqadd z25.s, z25.s, z10.s\n" - "sqadd z26.s, z26.s, z4.s\n" - "sqadd z27.s, z27.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z3.s, z3.s, #0x1f\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z1.s, z1.s, #0x1f\n" + "sqadd z28.s, z28.s, z7.s\n" + "sqadd z29.s, z29.s, z6.s\n" + "sqadd z30.s, z30.s, z5.s\n" + "sqadd z24.s, z24.s, z4.s\n" + "sqadd z25.s, z25.s, z3.s\n" + "sqadd z26.s, z26.s, z2.s\n" + "sqadd z27.s, z27.s, z1.s\n" "55:" // Height 4: no shift correction - "add x23, %x[qp], %[c_offset]\n" - "ld1rw { z4.s }, p2/Z, [x23]\n" + "add x20, %x[qp], %[c_offset]\n" + "ld1rw { z2.s }, p2/Z, [x20]\n" ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" - "add z31.s, z31.s, z4.s\n" + "add z31.s, z31.s, z2.s\n" ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" - "add z20.s, z20.s, z4.s\n" - "add z21.s, z21.s, z4.s\n" + "add z20.s, z20.s, z2.s\n" + "add z21.s, z21.s, z2.s\n" ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" - "add z22.s, z22.s, z4.s\n" - "add z16.s, z16.s, z4.s\n" + "add z22.s, z22.s, z2.s\n" + "add z16.s, z16.s, z2.s\n" ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" - "add z17.s, z17.s, z4.s\n" - "add z18.s, z18.s, z4.s\n" + "add z17.s, z17.s, z2.s\n" + "add z18.s, z18.s, z2.s\n" ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" - "add z19.s, z19.s, z4.s\n" - "add z23.s, z23.s, z4.s\n" + "add z19.s, z19.s, z2.s\n" + "add z23.s, z23.s, z2.s\n" ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" - "add z28.s, z28.s, z4.s\n" - "add z29.s, z29.s, z4.s\n" + "add z28.s, z28.s, z2.s\n" + "add z29.s, z29.s, z2.s\n" ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" - "add z30.s, z30.s, z4.s\n" - "add z24.s, z24.s, z4.s\n" + "add z30.s, z30.s, z2.s\n" + "add z24.s, z24.s, z2.s\n" ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" - "add z25.s, z25.s, z4.s\n" - "add z26.s, z26.s, z4.s\n" + "add z25.s, z25.s, z2.s\n" + "add z26.s, z26.s, z2.s\n" ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" - "add x23, %x[qp], %[maxval]\n" - "ld1rw { z6.s }, p2/Z, [x23]\n" - "add z27.s, z27.s, z4.s\n" - "add x23, %x[qp], %[minval]\n" - "ld1rw { z5.s }, p2/Z, [x23]\n" - "smin z31.s, p2/M, z31.s, z6.s\n" - "smin z20.s, p2/M, z20.s, z6.s\n" - "smin z21.s, p2/M, z21.s, z6.s\n" - "smin z22.s, p2/M, z22.s, z6.s\n" - "smin z16.s, p2/M, z16.s, z6.s\n" - "smin z17.s, p2/M, z17.s, z6.s\n" - "smin z18.s, p2/M, z18.s, z6.s\n" - "smin z19.s, p2/M, z19.s, z6.s\n" - "smin z23.s, p2/M, z23.s, z6.s\n" - "smin z28.s, p2/M, z28.s, z6.s\n" - "smin z29.s, p2/M, z29.s, z6.s\n" - "smin z30.s, p2/M, z30.s, z6.s\n" - "smin z24.s, p2/M, z24.s, z6.s\n" - "smin z25.s, p2/M, z25.s, z6.s\n" - "smin z26.s, p2/M, z26.s, z6.s\n" - "smin z27.s, p2/M, z27.s, z6.s\n" - "smax z31.s, p2/M, z31.s, z5.s\n" - "smax z20.s, p2/M, z20.s, z5.s\n" - "smax z21.s, p2/M, z21.s, z5.s\n" + "add x20, %x[qp], %[maxval]\n" + "ld1rw { z1.s }, p2/Z, [x20]\n" + "add z27.s, z27.s, z2.s\n" + "add x20, %x[qp], %[minval]\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "smin z31.s, p2/M, z31.s, z1.s\n" + "smin z20.s, p2/M, z20.s, z1.s\n" + "smin z21.s, p2/M, z21.s, z1.s\n" + "smin z22.s, p2/M, z22.s, z1.s\n" + 
"smin z16.s, p2/M, z16.s, z1.s\n" + "smin z17.s, p2/M, z17.s, z1.s\n" + "smin z18.s, p2/M, z18.s, z1.s\n" + "smin z19.s, p2/M, z19.s, z1.s\n" + "smin z23.s, p2/M, z23.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z1.s\n" + "smin z29.s, p2/M, z29.s, z1.s\n" + "smin z30.s, p2/M, z30.s, z1.s\n" + "smin z24.s, p2/M, z24.s, z1.s\n" + "smin z25.s, p2/M, z25.s, z1.s\n" + "smin z26.s, p2/M, z26.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z1.s\n" + "smax z31.s, p2/M, z31.s, z0.s\n" + "smax z20.s, p2/M, z20.s, z0.s\n" + "smax z21.s, p2/M, z21.s, z0.s\n" "uzp1 z31.h, z31.h, z20.h\n" - "smax z22.s, p2/M, z22.s, z5.s\n" - "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z22.s, p2/M, z22.s, z0.s\n" + "smax z16.s, p2/M, z16.s, z0.s\n" "uzp1 z20.h, z21.h, z22.h\n" "uzp1 z31.b, z31.b, z20.b\n" - "smax z17.s, p2/M, z17.s, z5.s\n" - "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z0.s\n" + "smax z18.s, p2/M, z18.s, z0.s\n" "uzp1 z16.h, z16.h, z17.h\n" "st1b { z31.b }, p1, [x27]\n" - "smax z19.s, p2/M, z19.s, z5.s\n" - "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z0.s\n" + "smax z23.s, p2/M, z23.s, z0.s\n" "uzp1 z17.h, z18.h, z19.h\n" "uzp1 z16.b, z16.b, z17.b\n" - "smax z28.s, p2/M, z28.s, z5.s\n" - "smax z29.s, p2/M, z29.s, z5.s\n" + "smax z28.s, p2/M, z28.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z0.s\n" "uzp1 z23.h, z23.h, z28.h\n" - "st1b { z16.b }, p1, [x22]\n" - "smax z30.s, p2/M, z30.s, z5.s\n" - "smax z24.s, p2/M, z24.s, z5.s\n" - "uzp1 z28.h, z29.h, z30.h\n" - "uzp1 z23.b, z23.b, z28.b\n" - "smax z25.s, p2/M, z25.s, z5.s\n" - "smax z26.s, p2/M, z26.s, z5.s\n" + "st1b { z16.b }, p1, [x23]\n" + "smax z30.s, p2/M, z30.s, z0.s\n" + "smax z24.s, p2/M, z24.s, z0.s\n" + "uzp1 z16.h, z29.h, z30.h\n" + "uzp1 z23.b, z23.b, z16.b\n" + "smax z25.s, p2/M, z25.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z0.s\n" "uzp1 z24.h, z24.h, z25.h\n" - "st1b { z23.b }, p1, [x21]\n" - "smax z27.s, p2/M, z27.s, z5.s\n" - "uzp1 z25.h, z26.h, z27.h\n" - "uzp1 z24.b, z24.b, z25.b\n" - "st1b { z24.b }, p1, [x20]\n" + "st1b { z23.b }, p1, [x22]\n" + "smax z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z16.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z16.b\n" + "st1b { z24.b }, p1, [x21]\n" "addvl x27, x27, #1\n" "56:" // Height 4: Writeback done "decw x9, ALL, MUL #4\n" @@ -1407,7 +1407,6 @@ void sve_hybrid_u8qa_mmla_4x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "58:" // Exit - : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", 
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1415,4 +1414,4 @@ void sve_hybrid_u8qa_mmla_4x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp index 901cc6d63e..e9197e8ec5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -39,6 +39,7 @@ namespace arm_gemm { // Actual kernel implementations void sve_hybrid_u8u32_dot_6x4VL( ARGLIST ); +void sve_hybrid_u8u32_dot_6x4VL_a64fx( ARGLIST ); class cls_sve_hybrid_u8u32_dot_6x4VL { @@ -74,7 +75,6 @@ public: template<typename T> static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same<T, uint32_t>::value) { switch (ci->get_cpu_model()) { default: @@ -83,10 +83,11 @@ public: return { 20.98 }; case CPUModel::V1: return { 62.19 }; + case CPUModel::A64FX: + return { 91.23 }; } } - if (std::is_same<T, uint8_t>::value) { switch (ci->get_cpu_model()) { default: @@ -95,6 +96,8 @@ public: return { 22.75, 3.90, 0.47 }; case CPUModel::V1: return { 48.09, 16.24, 0.83 }; + case CPUModel::A64FX: + return { 101.62, 3.15, 0.42 }; } } @@ -103,13 +106,19 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_u8u32_dot_6x4VL; - cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *) + cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_hybrid_u8u32_dot_6x4VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp index a7dbef329e..4d0f44982a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp @@ -115,11 +115,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -135,12 +135,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "8:" // Height 1: Multiply loop: Main loop "udot z8.s, z6.b, z0.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x26, x26, #0x4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z11.s, z7.b, z0.b\n" + "udot z10.s, z17.b, z0.b\n" + "udot z11.s, z16.b, z0.b\n" "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" @@ -150,12 +150,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "udot z8.s, z6.b, z0.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "udot z10.s, z6.b, z0.b\n" - "udot z11.s, z7.b, z0.b\n" + "udot z10.s, z17.b, z0.b\n" + "udot z11.s, z16.b, z0.b\n" "addvl x10, x10, #4\n" "bne 5b\n" "st1w { z8.s }, p3, [x9]\n" @@ -183,15 +183,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n"
"tbz %x[flags], #0, 13f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x20]\n" + "ld1w { z13.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x20, #3, MUL VL]\n" "b 14f\n" "13:" // Height 2: no accumulate "mov z8.s, #0x0\n" @@ -207,12 +207,12 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "15:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 16f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 17f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -220,7 +220,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "b 17f\n" "16:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "17:" // Height 2: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -231,18 +231,18 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "18:" // Height 2: Multiply loop: Main loop "udot z8.s, z6.b, z0.b\n" "udot z12.s, z6.b, z1.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" "add x26, x26, #0x4\n" "udot z9.s, z7.b, z0.b\n" "udot z13.s, z7.b, z1.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "subs x27, x27, #0x4\n" "add x25, x25, #0x4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" + "udot z10.s, z17.b, z0.b\n" + "udot z14.s, z17.b, z1.b\n" + "udot z11.s, z16.b, z0.b\n" + "udot z15.s, z16.b, z1.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1b { z6.b }, p4/Z, [x10]\n" @@ -252,29 +252,29 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "udot z8.s, z6.b, z0.b\n" "udot z12.s, z6.b, z1.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z17.b }, p4/Z, [x10, #2, MUL VL]\n" "udot z9.s, z7.b, z0.b\n" "udot z13.s, z7.b, z1.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z16.b }, p4/Z, [x10, #3, MUL VL]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" + "udot z10.s, z17.b, z0.b\n" + "udot z14.s, z17.b, z1.b\n" "addvl x10, x10, #4\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" + "udot z11.s, z16.b, z0.b\n" + "udot z15.s, z16.b, z1.b\n" "bne 15b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, 
MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x20]\n" + "st1w { z13.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x20, #3, MUL VL]\n" "20:" // Height 2: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -295,20 +295,20 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 23f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x21]\n" + "ld1w { z13.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x20]\n" + "ld1w { z17.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x20, #3, MUL VL]\n" "b 24f\n" "23:" // Height 3: no accumulate "mov z8.s, #0x0\n" @@ -328,13 +328,13 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "25:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 26f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 27f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -343,8 +343,8 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "b 27f\n" "26:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "27:" // Height 3: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -360,21 +360,21 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "subs x27, x27, #0x4\n" "udot z16.s, z6.b, z2.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "udot z13.s, z7.b, z1.b\n" "udot z17.s, z7.b, z2.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" "add x24, x24, #0x4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z11.s, z7.b, z0.b\n" + "udot z10.s, z21.b, z0.b\n" + "udot z14.s, z21.b, z1.b\n" + "udot z18.s, z21.b, z2.b\n" + "udot z11.s, z20.b, z0.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "udot z15.s, z7.b, z1.b\n" - "udot z19.s, z7.b, z2.b\n" + "udot z15.s, z20.b, z1.b\n" + "udot z19.s, z20.b, z2.b\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1b { z7.b }, 
p4/Z, [x10, #1, MUL VL]\n" @@ -386,35 +386,35 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "add x28, x28, #0x1\n" "udot z16.s, z6.b, z2.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "udot z13.s, z7.b, z1.b\n" "udot z17.s, z7.b, z2.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z20.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" - "udot z19.s, z7.b, z2.b\n" + "udot z10.s, z21.b, z0.b\n" + "udot z14.s, z21.b, z1.b\n" + "udot z18.s, z21.b, z2.b\n" + "udot z11.s, z20.b, z0.b\n" + "udot z15.s, z20.b, z1.b\n" + "udot z19.s, z20.b, z2.b\n" "bne 25b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x21]\n" + "st1w { z13.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x20]\n" + "st1w { z17.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x20, #3, MUL VL]\n" "30:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -435,25 +435,25 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 33f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p3/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x22]\n" - "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x22]\n" + "ld1w { z13.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x21]\n" + "ld1w { z17.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x20]\n" + "ld1w { z21.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x20, #3, MUL VL]\n" "b 34f\n" "33:" // Height 4: no accumulate "mov z8.s, #0x0\n" @@ -477,14 +477,14 @@ void 
sve_hybrid_u8u32_dot_6x4VL_a64fx ( "35:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 36f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 37f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -494,9 +494,9 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "b 37f\n" "36:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "37:" // Height 4: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -513,7 +513,7 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "subs x27, x27, #0x4\n" "udot z16.s, z6.b, z2.b\n" "udot z20.s, z6.b, z3.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x4\n" "udot z9.s, z7.b, z0.b\n" "udot z13.s, z7.b, z1.b\n" @@ -521,19 +521,19 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "add x23, x23, #0x4\n" "udot z17.s, z7.b, z2.b\n" "udot z21.s, z7.b, z3.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z22.s, z6.b, z3.b\n" + "udot z10.s, z25.b, z0.b\n" + "udot z14.s, z25.b, z1.b\n" + "udot z18.s, z25.b, z2.b\n" + "udot z22.s, z25.b, z3.b\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" + "udot z11.s, z24.b, z0.b\n" + "udot z15.s, z24.b, z1.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1rw { z1.s }, p4/Z, [x25]\n" - "udot z19.s, z7.b, z2.b\n" - "udot z23.s, z7.b, z3.b\n" + "udot z19.s, z24.b, z2.b\n" + "udot z23.s, z24.b, z3.b\n" "ld1rw { z2.s }, p4/Z, [x24]\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n" @@ -545,44 +545,44 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "add x28, x28, #0x1\n" "udot z16.s, z6.b, z2.b\n" "udot z20.s, z6.b, z3.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p4/Z, [x10, #2, MUL VL]\n" "cmp x28, x20\n" "udot z9.s, z7.b, z0.b\n" "udot z13.s, z7.b, z1.b\n" "udot z17.s, z7.b, z2.b\n" "udot z21.s, z7.b, z3.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z24.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z22.s, z6.b, z3.b\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" - "udot z19.s, z7.b, z2.b\n" - "udot z23.s, z7.b, z3.b\n" + "udot z10.s, z25.b, z0.b\n" + "udot z14.s, z25.b, z1.b\n" + "udot z18.s, z25.b, z2.b\n" + "udot z22.s, z25.b, z3.b\n" + "udot z11.s, z24.b, z0.b\n" + "udot z15.s, z24.b, z1.b\n" + "udot z19.s, z24.b, z2.b\n" + "udot z23.s, z24.b, z3.b\n" "bne 35b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "st1w { z8.s }, p3, [x9]\n" - "add x22, x23, 
x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p3, [x22]\n" - "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x22]\n" + "st1w { z13.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x21]\n" + "st1w { z17.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x20]\n" + "st1w { z21.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x20, #3, MUL VL]\n" "40:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -603,30 +603,30 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "whilelt p0.s, x20, x11\n" "tbz %x[flags], #0, 43f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z8.s }, p3/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p3/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p2/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p1/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p0/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p3/Z, [x24]\n" - "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p3/Z, [x23]\n" - "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p3/Z, [x22]\n" - "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n" - "ld1w { z24.s }, p3/Z, [x21]\n" - "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n" - "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n" - "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p3/Z, [x23]\n" + "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p3/Z, [x22]\n" + "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p3/Z, [x21]\n" + "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p3/Z, [x20]\n" + "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n" "b 44f\n" "43:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -654,15 +654,15 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "45:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 
46f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -673,10 +673,10 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "b 47f\n" "46:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "47:" // Height 5: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -698,29 +698,29 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "add x24, x24, #0x4\n" "udot z24.s, z6.b, z4.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n" "add x23, x23, #0x4\n" "udot z13.s, z7.b, z1.b\n" "udot z17.s, z7.b, z2.b\n" "add x22, x22, #0x4\n" "udot z21.s, z7.b, z3.b\n" "udot z25.s, z7.b, z4.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z22.s, z6.b, z3.b\n" - "udot z26.s, z6.b, z4.b\n" - "udot z11.s, z7.b, z0.b\n" + "udot z10.s, z29.b, z0.b\n" + "udot z14.s, z29.b, z1.b\n" + "udot z18.s, z29.b, z2.b\n" + "udot z22.s, z29.b, z3.b\n" + "udot z26.s, z29.b, z4.b\n" + "udot z11.s, z28.b, z0.b\n" "ld1rw { z0.s }, p4/Z, [x26]\n" "ld1b { z6.b }, p4/Z, [x10]\n" - "udot z15.s, z7.b, z1.b\n" - "udot z19.s, z7.b, z2.b\n" + "udot z15.s, z28.b, z1.b\n" + "udot z19.s, z28.b, z2.b\n" "ld1rw { z1.s }, p4/Z, [x25]\n" "ld1rw { z2.s }, p4/Z, [x24]\n" - "udot z23.s, z7.b, z3.b\n" - "udot z27.s, z7.b, z4.b\n" + "udot z23.s, z28.b, z3.b\n" + "udot z27.s, z28.b, z4.b\n" "ld1rw { z3.s }, p4/Z, [x23]\n" "ld1rw { z4.s }, p4/Z, [x22]\n" "ld1b { z7.b }, p4/Z, [x10, #1, MUL VL]\n" @@ -735,50 +735,50 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "cmp x28, x20\n" "udot z24.s, z6.b, z4.b\n" "udot z9.s, z7.b, z0.b\n" - "ld1b { z6.b }, p4/Z, [x10, #2, MUL VL]\n" + "ld1b { z29.b }, p4/Z, [x10, #2, MUL VL]\n" "udot z13.s, z7.b, z1.b\n" "udot z17.s, z7.b, z2.b\n" "udot z21.s, z7.b, z3.b\n" "udot z25.s, z7.b, z4.b\n" - "ld1b { z7.b }, p4/Z, [x10, #3, MUL VL]\n" + "ld1b { z28.b }, p4/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b\n" - "udot z14.s, z6.b, z1.b\n" - "udot z18.s, z6.b, z2.b\n" - "udot z22.s, z6.b, z3.b\n" - "udot z26.s, z6.b, z4.b\n" - "udot z11.s, z7.b, z0.b\n" - "udot z15.s, z7.b, z1.b\n" - "udot z19.s, z7.b, z2.b\n" - "udot z23.s, z7.b, z3.b\n" - "udot z27.s, z7.b, z4.b\n" + "udot z10.s, z29.b, z0.b\n" + "udot z14.s, z29.b, z1.b\n" + "udot z18.s, z29.b, z2.b\n" + "udot z22.s, z29.b, z3.b\n" + "udot z26.s, z29.b, z4.b\n" + "udot z11.s, z28.b, z0.b\n" + "udot z15.s, z28.b, z1.b\n" + "udot z19.s, z28.b, z2.b\n" + "udot z23.s, z28.b, z3.b\n" + "udot z27.s, z28.b, z4.b\n" "bne 45b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "st1w { z8.s }, p3, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "st1w { z8.s }, 
p3, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p2, [x9, #1, MUL VL]\n" "st1w { z10.s }, p1, [x9, #2, MUL VL]\n" "st1w { z11.s }, p0, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p3, [x24]\n" - "st1w { z13.s }, p2, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p1, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p0, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p3, [x23]\n" - "st1w { z17.s }, p2, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p1, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p0, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p3, [x22]\n" - "st1w { z21.s }, p2, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p1, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p0, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p3, [x21]\n" - "st1w { z25.s }, p2, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p1, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z12.s }, p3, [x23]\n" + "st1w { z13.s }, p2, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p1, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p0, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p3, [x22]\n" + "st1w { z17.s }, p2, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p1, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p0, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p3, [x21]\n" + "st1w { z21.s }, p2, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p1, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p0, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p3, [x20]\n" + "st1w { z25.s }, p2, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p1, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p0, [x20, #3, MUL VL]\n" "50:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -862,16 +862,16 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "55:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 56f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 57f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -883,11 +883,11 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "b 57f\n" "56:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "57:" // Height 6: input setup done "subs x27, x27, #0x4\n" "ld1rw { z0.s }, p4/Z, [x26]\n" @@ -1022,7 +1022,6 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "62:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), 
[offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1030,4 +1029,4 @@ void sve_hybrid_u8u32_dot_6x4VL_a64fx ( } } // namespace arm_gemm -#endif // ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp index 30a108af7e..7871c0b003 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp @@ -115,11 +115,11 @@ void sve_hybrid_u8u32_dot_6x4VL ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -132,87 +132,87 @@ void sve_hybrid_u8u32_dot_6x4VL ( "8:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "udot z10.s, z6.b, z0.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10]\n" + "udot z8.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z9.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z10.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z11.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #4, MUL VL]\n" + "udot z8.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z9.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #6, MUL VL]\n" + "udot z10.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL 
VL]\n" + "udot z11.s, z16.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z17.b, z0.b[2]\n" + "udot z9.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z17.b, z0.b[2]\n" + "udot z11.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z17.b, z0.b[3]\n" + "udot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" + "udot z10.s, z17.b, z0.b[3]\n" + "udot z11.s, z16.b, z0.b[3]\n" "add x26, x26, #0x10\n" "bgt 8b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10]\n" + "udot z8.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z9.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z17.b, z0.b[0]\n" + "udot z11.s, z16.b, z0.b[0]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[1]\n" + "udot z9.s, z16.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" + "udot z10.s, z17.b, z0.b[1]\n" + "udot z11.s, z16.b, z0.b[1]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[2]\n" + "udot z9.s, z16.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" + "udot z10.s, z17.b, z0.b[2]\n" + "udot z11.s, z16.b, z0.b[2]\n" "addvl x10, x10, #4\n" "ble 10f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[3]\n" + "udot z9.s, z16.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z17.b, z0.b[3]\n" + "udot z11.s, z16.b, z0.b[3]\n" "addvl x10, x10, #4\n" "10:" // Height 1: Multiply loop: 
multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -244,15 +244,15 @@ void sve_hybrid_u8u32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 14f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" "b 15f\n" "14:" // Height 2: no accumulate "mov z8.s, #0x0\n" @@ -268,12 +268,12 @@ void sve_hybrid_u8u32_dot_6x4VL ( "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -281,146 +281,146 @@ void sve_hybrid_u8u32_dot_6x4VL ( "b 18f\n" "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "18:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 20f\n" "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z1.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z1.b[0]\n" + "udot z12.s, z17.b, z0.b[0]\n" + "udot z9.s, z16.b, z1.b[0]\n" + "udot z13.s, z16.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z17.b, z1.b[0]\n" + "udot z14.s, z17.b, z0.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" "cmp x27, #0x10\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z11.s, z16.b, z1.b[0]\n" + "udot z15.s, z16.b, z0.b[0]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" "add x26, x26, #0x10\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" + "udot z8.s, z17.b, z1.b[1]\n" + "udot z12.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" "add x25, x25, #0x10\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "udot z9.s, z16.b, z1.b[1]\n" + "udot z13.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, 
#7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" + "udot z10.s, z17.b, z1.b[1]\n" + "udot z14.s, z17.b, z0.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "udot z11.s, z16.b, z1.b[1]\n" + "udot z15.s, z16.b, z0.b[1]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z17.b, z1.b[2]\n" + "udot z12.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "udot z9.s, z16.b, z1.b[2]\n" + "udot z13.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z17.b, z1.b[2]\n" + "udot z14.s, z17.b, z0.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "udot z11.s, z16.b, z1.b[2]\n" + "udot z15.s, z16.b, z0.b[2]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z17.b, z1.b[3]\n" + "udot z12.s, z17.b, z0.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "udot z9.s, z16.b, z1.b[3]\n" + "udot z13.s, z16.b, z0.b[3]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" + "udot z10.s, z17.b, z1.b[3]\n" + "udot z14.s, z17.b, z0.b[3]\n" + "udot z11.s, z16.b, z1.b[3]\n" + "udot z15.s, z16.b, z0.b[3]\n" "bgt 19b\n" "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z0.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[0]\n" + "udot z12.s, z17.b, z1.b[0]\n" + "udot z9.s, z16.b, z0.b[0]\n" + "udot z13.s, z16.b, z1.b[0]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z17.b, z0.b[0]\n" + "udot z14.s, z17.b, z1.b[0]\n" "addvl x10, x10, #4\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" + "udot z11.s, z16.b, z0.b[0]\n" + "udot z15.s, z16.b, z1.b[0]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, 
z0.b[1]\n" + "udot z12.s, z17.b, z1.b[1]\n" + "udot z9.s, z16.b, z0.b[1]\n" + "udot z13.s, z16.b, z1.b[1]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" + "udot z10.s, z17.b, z0.b[1]\n" + "udot z14.s, z17.b, z1.b[1]\n" "addvl x10, x10, #4\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" + "udot z11.s, z16.b, z0.b[1]\n" + "udot z15.s, z16.b, z1.b[1]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[2]\n" + "udot z12.s, z17.b, z1.b[2]\n" + "udot z9.s, z16.b, z0.b[2]\n" + "udot z13.s, z16.b, z1.b[2]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" + "udot z10.s, z17.b, z0.b[2]\n" + "udot z14.s, z17.b, z1.b[2]\n" "addvl x10, x10, #4\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" + "udot z11.s, z16.b, z0.b[2]\n" + "udot z15.s, z16.b, z1.b[2]\n" "ble 21f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z17.b, z0.b[3]\n" + "udot z12.s, z17.b, z1.b[3]\n" + "udot z9.s, z16.b, z0.b[3]\n" + "udot z13.s, z16.b, z1.b[3]\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z17.b, z0.b[3]\n" + "udot z14.s, z17.b, z1.b[3]\n" "addvl x10, x10, #4\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" + "udot z11.s, z16.b, z0.b[3]\n" + "udot z15.s, z16.b, z1.b[3]\n" "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 16b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" + "add x20, x9, x20, LSL #2\n" "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x20]\n" + "st1w { z13.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x20, #3, MUL VL]\n" "22:" // Height 2: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -441,20 +441,20 @@ void sve_hybrid_u8u32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, 
MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x20]\n" + "ld1w { z17.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x20, #3, MUL VL]\n" "b 26f\n" "25:" // Height 3: no accumulate "mov z8.s, #0x0\n" @@ -474,13 +474,13 @@ void sve_hybrid_u8u32_dot_6x4VL ( "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 28f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 29f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -489,86 +489,86 @@ void sve_hybrid_u8u32_dot_6x4VL ( "b 29f\n" "28:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "29:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 31f\n" "30:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" "ld1rqb { z1.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x24]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "udot z8.s, z21.b, z2.b[0]\n" + "udot z12.s, z21.b, z1.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z16.s, z21.b, z0.b[0]\n" + "udot z9.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z20.b, z1.b[0]\n" + "udot z17.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "cmp x27, #0x10\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" + "udot z10.s, z21.b, z2.b[0]\n" + "udot z14.s, z21.b, z1.b[0]\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" + "udot z18.s, z21.b, z0.b[0]\n" + "udot z11.s, z20.b, z2.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" 
- "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "udot z15.s, z20.b, z1.b[0]\n" + "udot z19.s, z20.b, z0.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z8.s, z21.b, z2.b[1]\n" + "udot z12.s, z21.b, z1.b[1]\n" + "udot z16.s, z21.b, z0.b[1]\n" + "udot z9.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #6, MUL VL]\n" + "udot z13.s, z20.b, z1.b[1]\n" + "udot z17.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" + "udot z10.s, z21.b, z2.b[1]\n" + "udot z14.s, z21.b, z1.b[1]\n" + "udot z18.s, z21.b, z0.b[1]\n" + "udot z11.s, z20.b, z2.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #-8, MUL VL]\n" + "udot z15.s, z20.b, z1.b[1]\n" + "udot z19.s, z20.b, z0.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z21.b, z2.b[2]\n" + "udot z12.s, z21.b, z1.b[2]\n" + "udot z16.s, z21.b, z0.b[2]\n" + "udot z9.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #-6, MUL VL]\n" + "udot z13.s, z20.b, z1.b[2]\n" + "udot z17.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z21.b, z2.b[2]\n" + "udot z14.s, z21.b, z1.b[2]\n" + "udot z18.s, z21.b, z0.b[2]\n" + "udot z11.s, z20.b, z2.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #-4, MUL VL]\n" + "udot z15.s, z20.b, z1.b[2]\n" + "udot z19.s, z20.b, z0.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z21.b, z2.b[3]\n" + "udot z12.s, z21.b, z1.b[3]\n" + "udot z16.s, z21.b, z0.b[3]\n" + "udot z9.s, z20.b, z2.b[3]\n" + "ld1b { z21.b }, p5/Z, [x10, #-2, MUL VL]\n" + "udot z13.s, z20.b, z1.b[3]\n" + "udot z17.s, z20.b, z0.b[3]\n" + "ld1b { z20.b }, p5/Z, [x10, #-1, MUL VL]\n" + "udot z10.s, z21.b, z2.b[3]\n" + "udot z14.s, z21.b, z1.b[3]\n" + "udot z18.s, z21.b, z0.b[3]\n" + "udot z11.s, z20.b, z2.b[3]\n" + "udot z15.s, z20.b, z1.b[3]\n" + "udot z19.s, z20.b, z0.b[3]\n" "bgt 30b\n" "31:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -576,100 +576,100 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z1.b }, p0/Z, [x25]\n" "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, 
z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "udot z8.s, z21.b, z0.b[0]\n" + "udot z12.s, z21.b, z1.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z16.s, z21.b, z2.b[0]\n" + "udot z9.s, z20.b, z0.b[0]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z20.b, z1.b[0]\n" + "udot z17.s, z20.b, z2.b[0]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" + "udot z10.s, z21.b, z0.b[0]\n" + "udot z14.s, z21.b, z1.b[0]\n" + "udot z18.s, z21.b, z2.b[0]\n" + "udot z11.s, z20.b, z0.b[0]\n" + "udot z15.s, z20.b, z1.b[0]\n" + "udot z19.s, z20.b, z2.b[0]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z21.b, z0.b[1]\n" + "udot z12.s, z21.b, z1.b[1]\n" + "udot z16.s, z21.b, z2.b[1]\n" + "udot z9.s, z20.b, z0.b[1]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z13.s, z20.b, z1.b[1]\n" + "udot z17.s, z20.b, z2.b[1]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" + "udot z10.s, z21.b, z0.b[1]\n" + "udot z14.s, z21.b, z1.b[1]\n" + "udot z18.s, z21.b, z2.b[1]\n" + "udot z11.s, z20.b, z0.b[1]\n" + "udot z15.s, z20.b, z1.b[1]\n" + "udot z19.s, z20.b, z2.b[1]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z21.b, z0.b[2]\n" + "udot z12.s, z21.b, z1.b[2]\n" + "udot z16.s, z21.b, z2.b[2]\n" + "udot z9.s, z20.b, z0.b[2]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z13.s, z20.b, z1.b[2]\n" + "udot z17.s, z20.b, z2.b[2]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" + "udot z10.s, z21.b, z0.b[2]\n" + "udot z14.s, z21.b, z1.b[2]\n" + "udot z18.s, z21.b, z2.b[2]\n" + "udot z11.s, z20.b, z0.b[2]\n" + "udot z15.s, z20.b, z1.b[2]\n" + "udot z19.s, z20.b, z2.b[2]\n" "ble 32f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, 
z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z21.b }, p5/Z, [x10]\n" + "ld1b { z20.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z21.b, z0.b[3]\n" + "udot z12.s, z21.b, z1.b[3]\n" + "udot z16.s, z21.b, z2.b[3]\n" + "udot z9.s, z20.b, z0.b[3]\n" + "ld1b { z21.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z20.b, z1.b[3]\n" + "udot z17.s, z20.b, z2.b[3]\n" + "ld1b { z20.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" + "udot z10.s, z21.b, z0.b[3]\n" + "udot z14.s, z21.b, z1.b[3]\n" + "udot z18.s, z21.b, z2.b[3]\n" + "udot z11.s, z20.b, z0.b[3]\n" + "udot z15.s, z20.b, z1.b[3]\n" + "udot z19.s, z20.b, z2.b[3]\n" "32:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 27b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z8.s }, p4, [x9]\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x21]\n" + "st1w { z13.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x20, #3, MUL VL]\n" "33:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -690,25 +690,25 @@ void sve_hybrid_u8u32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "ld1w { z8.s }, p4/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x21]\n" + "ld1w { z17.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, 
[x21, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "b 37f\n" "36:" // Height 4: no accumulate "mov z8.s, #0x0\n" @@ -732,14 +732,14 @@ void sve_hybrid_u8u32_dot_6x4VL ( "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 39f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -749,105 +749,105 @@ void sve_hybrid_u8u32_dot_6x4VL ( "b 40f\n" "39:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "40:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 42f\n" "41:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z3.b }, p0/Z, [x26]\n" + "ld1rqb { z2.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "ld1rqb { z0.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z25.b, z3.b[0]\n" + "udot z12.s, z25.b, z2.b[0]\n" + "udot z16.s, z25.b, z1.b[0]\n" + "udot z20.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "add x25, x25, #0x10\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" + "udot z9.s, z24.b, z3.b[0]\n" + "udot z13.s, z24.b, z2.b[0]\n" "add x24, x24, #0x10\n" "add x23, x23, #0x10\n" - "udot z17.s, z7.b, z2.b[0]\n" - "udot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "udot z17.s, z24.b, z1.b[0]\n" + "udot z21.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z25.b, z3.b[0]\n" + "udot z14.s, z25.b, z2.b[0]\n" + "udot 
z18.s, z25.b, z1.b[0]\n" + "udot z22.s, z25.b, z0.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "udot z11.s, z24.b, z3.b[0]\n" + "udot z15.s, z24.b, z2.b[0]\n" + "udot z19.s, z24.b, z1.b[0]\n" + "udot z23.s, z24.b, z0.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z8.s, z25.b, z3.b[1]\n" + "udot z12.s, z25.b, z2.b[1]\n" + "udot z16.s, z25.b, z1.b[1]\n" + "udot z20.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + "udot z9.s, z24.b, z3.b[1]\n" + "udot z13.s, z24.b, z2.b[1]\n" + "udot z17.s, z24.b, z1.b[1]\n" + "udot z21.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" + "udot z10.s, z25.b, z3.b[1]\n" + "udot z14.s, z25.b, z2.b[1]\n" + "udot z18.s, z25.b, z1.b[1]\n" + "udot z22.s, z25.b, z0.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" + "udot z11.s, z24.b, z3.b[1]\n" + "udot z15.s, z24.b, z2.b[1]\n" + "udot z19.s, z24.b, z1.b[1]\n" + "udot z23.s, z24.b, z0.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z25.b, z3.b[2]\n" + "udot z12.s, z25.b, z2.b[2]\n" + "udot z16.s, z25.b, z1.b[2]\n" + "udot z20.s, z25.b, z0.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" + "udot z9.s, z24.b, z3.b[2]\n" + "udot z13.s, z24.b, z2.b[2]\n" + "udot z17.s, z24.b, z1.b[2]\n" + "udot z21.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z25.b, z3.b[2]\n" + "udot z14.s, z25.b, z2.b[2]\n" + "udot z18.s, z25.b, z1.b[2]\n" + "udot z22.s, z25.b, z0.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + "udot z11.s, z24.b, z3.b[2]\n" + "udot z15.s, z24.b, z2.b[2]\n" + "udot z19.s, z24.b, z1.b[2]\n" + "udot z23.s, z24.b, z0.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z25.b, z3.b[3]\n" + "udot z12.s, z25.b, z2.b[3]\n" + "udot z16.s, z25.b, z1.b[3]\n" + "udot z20.s, z25.b, z0.b[3]\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" 
+ "udot z9.s, z24.b, z3.b[3]\n" + "udot z13.s, z24.b, z2.b[3]\n" + "udot z17.s, z24.b, z1.b[3]\n" + "udot z21.s, z24.b, z0.b[3]\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + "udot z10.s, z25.b, z3.b[3]\n" + "udot z14.s, z25.b, z2.b[3]\n" + "udot z18.s, z25.b, z1.b[3]\n" + "udot z22.s, z25.b, z0.b[3]\n" + "udot z11.s, z24.b, z3.b[3]\n" + "udot z15.s, z24.b, z2.b[3]\n" + "udot z19.s, z24.b, z1.b[3]\n" + "udot z23.s, z24.b, z0.b[3]\n" "bgt 41b\n" "42:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -856,121 +856,121 @@ void sve_hybrid_u8u32_dot_6x4VL ( "subs x27, x27, #0x4\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "udot z21.s, z7.b, z3.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z25.b, z0.b[0]\n" + "udot z12.s, z25.b, z1.b[0]\n" + "udot z16.s, z25.b, z2.b[0]\n" + "udot z20.s, z25.b, z3.b[0]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z24.b, z0.b[0]\n" + "udot z13.s, z24.b, z1.b[0]\n" + "udot z17.s, z24.b, z2.b[0]\n" + "udot z21.s, z24.b, z3.b[0]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" + "udot z10.s, z25.b, z0.b[0]\n" + "udot z14.s, z25.b, z1.b[0]\n" + "udot z18.s, z25.b, z2.b[0]\n" + "udot z22.s, z25.b, z3.b[0]\n" + "udot z11.s, z24.b, z0.b[0]\n" + "udot z15.s, z24.b, z1.b[0]\n" + "udot z19.s, z24.b, z2.b[0]\n" + "udot z23.s, z24.b, z3.b[0]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z25.b, z0.b[1]\n" + "udot z12.s, z25.b, z1.b[1]\n" + "udot z16.s, z25.b, z2.b[1]\n" + "udot z20.s, z25.b, z3.b[1]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z9.s, z24.b, z0.b[1]\n" + "udot z13.s, z24.b, z1.b[1]\n" + "udot z17.s, z24.b, z2.b[1]\n" + "udot z21.s, z24.b, z3.b[1]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" + "udot z10.s, z25.b, z0.b[1]\n" + "udot z14.s, z25.b, z1.b[1]\n" + "udot z18.s, z25.b, z2.b[1]\n" + "udot z22.s, z25.b, z3.b[1]\n" + "udot z11.s, z24.b, z0.b[1]\n" + "udot z15.s, z24.b, z1.b[1]\n" + "udot z19.s, z24.b, z2.b[1]\n" + "udot z23.s, z24.b, z3.b[1]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, 
[x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z25.b, z0.b[2]\n" + "udot z12.s, z25.b, z1.b[2]\n" + "udot z16.s, z25.b, z2.b[2]\n" + "udot z20.s, z25.b, z3.b[2]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x4\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z9.s, z24.b, z0.b[2]\n" + "udot z13.s, z24.b, z1.b[2]\n" + "udot z17.s, z24.b, z2.b[2]\n" + "udot z21.s, z24.b, z3.b[2]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" + "udot z10.s, z25.b, z0.b[2]\n" + "udot z14.s, z25.b, z1.b[2]\n" + "udot z18.s, z25.b, z2.b[2]\n" + "udot z22.s, z25.b, z3.b[2]\n" + "udot z11.s, z24.b, z0.b[2]\n" + "udot z15.s, z24.b, z1.b[2]\n" + "udot z19.s, z24.b, z2.b[2]\n" + "udot z23.s, z24.b, z3.b[2]\n" "ble 43f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z25.b, z0.b[3]\n" + "udot z12.s, z25.b, z1.b[3]\n" + "udot z16.s, z25.b, z2.b[3]\n" + "udot z20.s, z25.b, z3.b[3]\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z24.b, z0.b[3]\n" + "udot z13.s, z24.b, z1.b[3]\n" + "udot z17.s, z24.b, z2.b[3]\n" + "udot z21.s, z24.b, z3.b[3]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" + "udot z10.s, z25.b, z0.b[3]\n" + "udot z14.s, z25.b, z1.b[3]\n" + "udot z18.s, z25.b, z2.b[3]\n" + "udot z22.s, z25.b, z3.b[3]\n" + "udot z11.s, z24.b, z0.b[3]\n" + "udot z15.s, z24.b, z1.b[3]\n" + "udot z19.s, z24.b, z2.b[3]\n" + "udot z23.s, z24.b, z3.b[3]\n" "43:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 38b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" "st1w { z8.s }, p4, [x9]\n" - "add x22, x23, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { 
z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p4, [x22]\n" - "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x22]\n" + "st1w { z13.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x20]\n" + "st1w { z21.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x20, #3, MUL VL]\n" "44:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -991,30 +991,30 @@ void sve_hybrid_u8u32_dot_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z8.s }, p4/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z8.s }, p4/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "ld1w { z9.s }, p3/Z, [x9, #1, MUL VL]\n" "ld1w { z10.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z11.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z16.s }, p4/Z, [x23]\n" - "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n" - "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z24.s }, p4/Z, [x21]\n" - "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n" - "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x22]\n" + "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x20]\n" + "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n" "b 48f\n" "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -1042,15 +1042,15 @@ void sve_hybrid_u8u32_dot_6x4VL ( "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, 
#0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1061,124 +1061,124 @@ void sve_hybrid_u8u32_dot_6x4VL ( "b 51f\n" "50:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "51:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 53f\n" "52:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x26]\n" + "ld1rqb { z3.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" + "ld1rqb { z0.b }, p0/Z, [x22]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "udot z8.s, z29.b, z4.b[0]\n" + "udot z12.s, z29.b, z3.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z16.s, z29.b, z2.b[0]\n" + "udot z20.s, z29.b, z1.b[0]\n" "add x25, x25, #0x10\n" - "udot z24.s, z6.b, z4.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z24.s, z29.b, z0.b[0]\n" + "udot z9.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" "add x24, x24, #0x10\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" + "udot z13.s, z28.b, z3.b[0]\n" + "udot z17.s, z28.b, z2.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "udot z21.s, z7.b, z3.b[0]\n" - "udot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z26.s, z6.b, z4.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" - "udot z27.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" - "udot z24.s, z6.b, z4.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "udot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "udot z21.s, z28.b, z1.b[0]\n" + "udot z25.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z29.b, z4.b[0]\n" + "udot z14.s, z29.b, z3.b[0]\n" + "udot z18.s, z29.b, z2.b[0]\n" + "udot z22.s, z29.b, z1.b[0]\n" + "udot z26.s, z29.b, z0.b[0]\n" + "udot z11.s, z28.b, z4.b[0]\n" + "ld1b { z29.b }, p5/Z, [x10, #4, MUL VL]\n" + "udot z15.s, z28.b, z3.b[0]\n" + "udot z19.s, z28.b, z2.b[0]\n" + "udot z23.s, z28.b, z1.b[0]\n" + "udot z27.s, z28.b, z0.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z8.s, z29.b, z4.b[1]\n" + "udot z12.s, z29.b, z3.b[1]\n" + "udot z16.s, z29.b, z2.b[1]\n" + "udot z20.s, z29.b, z1.b[1]\n" + "udot z24.s, z29.b, z0.b[1]\n" + "udot z9.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, 
p5/Z, [x10, #6, MUL VL]\n" + "udot z13.s, z28.b, z3.b[1]\n" + "udot z17.s, z28.b, z2.b[1]\n" + "udot z21.s, z28.b, z1.b[1]\n" + "udot z25.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "udot z26.s, z6.b, z4.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" - "udot z27.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" - "udot z24.s, z6.b, z4.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "udot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "udot z26.s, z6.b, z4.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" - "udot z27.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "udot z24.s, z6.b, z4.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "udot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[3]\n" - "udot z26.s, z6.b, z4.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" - "udot z27.s, z7.b, z4.b[3]\n" + "udot z10.s, z29.b, z4.b[1]\n" + "udot z14.s, z29.b, z3.b[1]\n" + "udot z18.s, z29.b, z2.b[1]\n" + "udot z22.s, z29.b, z1.b[1]\n" + "udot z26.s, z29.b, z0.b[1]\n" + "udot z11.s, z28.b, z4.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10, #-8, MUL VL]\n" + "udot z15.s, z28.b, z3.b[1]\n" + "udot z19.s, z28.b, z2.b[1]\n" + "udot z23.s, z28.b, z1.b[1]\n" + "udot z27.s, z28.b, z0.b[1]\n" + "ld1b { z28.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z29.b, z4.b[2]\n" + "udot z12.s, z29.b, z3.b[2]\n" + "udot z16.s, z29.b, z2.b[2]\n" + "udot z20.s, z29.b, z1.b[2]\n" + "udot z24.s, z29.b, z0.b[2]\n" + "udot z9.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #-6, MUL VL]\n" + "udot z13.s, z28.b, z3.b[2]\n" + "udot z17.s, z28.b, z2.b[2]\n" + "udot z21.s, z28.b, z1.b[2]\n" + "udot z25.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z29.b, z4.b[2]\n" + "udot z14.s, z29.b, z3.b[2]\n" + "udot z18.s, z29.b, z2.b[2]\n" + "udot z22.s, z29.b, z1.b[2]\n" + "udot z26.s, z29.b, z0.b[2]\n" + "udot z11.s, z28.b, z4.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #-4, MUL VL]\n" + "udot z15.s, z28.b, z3.b[2]\n" + "udot z19.s, z28.b, z2.b[2]\n" + "udot z23.s, z28.b, z1.b[2]\n" + "udot z27.s, z28.b, z0.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z29.b, z4.b[3]\n" + "udot z12.s, z29.b, z3.b[3]\n" + "udot z16.s, z29.b, 
z2.b[3]\n" + "udot z20.s, z29.b, z1.b[3]\n" + "udot z24.s, z29.b, z0.b[3]\n" + "udot z9.s, z28.b, z4.b[3]\n" + "ld1b { z29.b }, p5/Z, [x10, #-2, MUL VL]\n" + "udot z13.s, z28.b, z3.b[3]\n" + "udot z17.s, z28.b, z2.b[3]\n" + "udot z21.s, z28.b, z1.b[3]\n" + "udot z25.s, z28.b, z0.b[3]\n" + "ld1b { z28.b }, p5/Z, [x10, #-1, MUL VL]\n" + "udot z10.s, z29.b, z4.b[3]\n" + "udot z14.s, z29.b, z3.b[3]\n" + "udot z18.s, z29.b, z2.b[3]\n" + "udot z22.s, z29.b, z1.b[3]\n" + "udot z26.s, z29.b, z0.b[3]\n" + "udot z11.s, z28.b, z4.b[3]\n" + "udot z15.s, z28.b, z3.b[3]\n" + "udot z19.s, z28.b, z2.b[3]\n" + "udot z23.s, z28.b, z1.b[3]\n" + "udot z27.s, z28.b, z0.b[3]\n" "bgt 52b\n" "53:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1188,142 +1188,142 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z2.b }, p0/Z, [x24]\n" "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" - "udot z24.s, z6.b, z4.b[0]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "udot z21.s, z7.b, z3.b[0]\n" - "udot z25.s, z7.b, z4.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "udot z8.s, z29.b, z0.b[0]\n" + "udot z12.s, z29.b, z1.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z16.s, z29.b, z2.b[0]\n" + "udot z20.s, z29.b, z3.b[0]\n" + "udot z24.s, z29.b, z4.b[0]\n" + "udot z9.s, z28.b, z0.b[0]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z28.b, z1.b[0]\n" + "udot z17.s, z28.b, z2.b[0]\n" + "udot z21.s, z28.b, z3.b[0]\n" + "udot z25.s, z28.b, z4.b[0]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z26.s, z6.b, z4.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" - "udot z27.s, z7.b, z4.b[0]\n" + "udot z10.s, z29.b, z0.b[0]\n" + "udot z14.s, z29.b, z1.b[0]\n" + "udot z18.s, z29.b, z2.b[0]\n" + "udot z22.s, z29.b, z3.b[0]\n" + "udot z26.s, z29.b, z4.b[0]\n" + "udot z11.s, z28.b, z0.b[0]\n" + "udot z15.s, z28.b, z1.b[0]\n" + "udot z19.s, z28.b, z2.b[0]\n" + "udot z23.s, z28.b, z3.b[0]\n" + "udot z27.s, z28.b, z4.b[0]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z29.b, z0.b[1]\n" + "udot z12.s, z29.b, z1.b[1]\n" + "udot z16.s, z29.b, z2.b[1]\n" + "udot z20.s, z29.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "udot z24.s, z6.b, z4.b[1]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "udot z25.s, z7.b, z4.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z24.s, z29.b, z4.b[1]\n" + "udot z9.s, z28.b, z0.b[1]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z28.b, z1.b[1]\n" + "udot z17.s, z28.b, z2.b[1]\n" + "udot z21.s, z28.b, z3.b[1]\n" + "udot z25.s, z28.b, z4.b[1]\n" 
+ "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "udot z26.s, z6.b, z4.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" - "udot z27.s, z7.b, z4.b[1]\n" + "udot z10.s, z29.b, z0.b[1]\n" + "udot z14.s, z29.b, z1.b[1]\n" + "udot z18.s, z29.b, z2.b[1]\n" + "udot z22.s, z29.b, z3.b[1]\n" + "udot z26.s, z29.b, z4.b[1]\n" + "udot z11.s, z28.b, z0.b[1]\n" + "udot z15.s, z28.b, z1.b[1]\n" + "udot z19.s, z28.b, z2.b[1]\n" + "udot z23.s, z28.b, z3.b[1]\n" + "udot z27.s, z28.b, z4.b[1]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z29.b, z0.b[2]\n" + "udot z12.s, z29.b, z1.b[2]\n" + "udot z16.s, z29.b, z2.b[2]\n" + "udot z20.s, z29.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "udot z24.s, z6.b, z4.b[2]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "udot z25.s, z7.b, z4.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z24.s, z29.b, z4.b[2]\n" + "udot z9.s, z28.b, z0.b[2]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z28.b, z1.b[2]\n" + "udot z17.s, z28.b, z2.b[2]\n" + "udot z21.s, z28.b, z3.b[2]\n" + "udot z25.s, z28.b, z4.b[2]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "udot z26.s, z6.b, z4.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" - "udot z27.s, z7.b, z4.b[2]\n" + "udot z10.s, z29.b, z0.b[2]\n" + "udot z14.s, z29.b, z1.b[2]\n" + "udot z18.s, z29.b, z2.b[2]\n" + "udot z22.s, z29.b, z3.b[2]\n" + "udot z26.s, z29.b, z4.b[2]\n" + "udot z11.s, z28.b, z0.b[2]\n" + "udot z15.s, z28.b, z1.b[2]\n" + "udot z19.s, z28.b, z2.b[2]\n" + "udot z23.s, z28.b, z3.b[2]\n" + "udot z27.s, z28.b, z4.b[2]\n" "ble 54f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "udot z24.s, z6.b, z4.b[3]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "udot z25.s, z7.b, z4.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z29.b }, p5/Z, [x10]\n" + "ld1b { z28.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z29.b, z0.b[3]\n" + "udot z12.s, z29.b, z1.b[3]\n" + "udot z16.s, z29.b, z2.b[3]\n" + "udot z20.s, z29.b, z3.b[3]\n" + "udot z24.s, z29.b, z4.b[3]\n" + "udot z9.s, z28.b, z0.b[3]\n" + "ld1b { z29.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z13.s, z28.b, z1.b[3]\n" + "udot z17.s, z28.b, z2.b[3]\n" + "udot z21.s, z28.b, z3.b[3]\n" + "udot z25.s, z28.b, z4.b[3]\n" + "ld1b { z28.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, 
z3.b[3]\n" - "udot z26.s, z6.b, z4.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" - "udot z27.s, z7.b, z4.b[3]\n" + "udot z10.s, z29.b, z0.b[3]\n" + "udot z14.s, z29.b, z1.b[3]\n" + "udot z18.s, z29.b, z2.b[3]\n" + "udot z22.s, z29.b, z3.b[3]\n" + "udot z26.s, z29.b, z4.b[3]\n" + "udot z11.s, z28.b, z0.b[3]\n" + "udot z15.s, z28.b, z1.b[3]\n" + "udot z19.s, z28.b, z2.b[3]\n" + "udot z23.s, z28.b, z3.b[3]\n" + "udot z27.s, z28.b, z4.b[3]\n" "54:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 49b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "st1w { z8.s }, p4, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "st1w { z8.s }, p4, [x9]\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "st1w { z9.s }, p3, [x9, #1, MUL VL]\n" "st1w { z10.s }, p2, [x9, #2, MUL VL]\n" "st1w { z11.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z12.s }, p4, [x24]\n" - "st1w { z13.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z14.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z15.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z20.s }, p4, [x22]\n" - "st1w { z21.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z22.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z23.s }, p1, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p4, [x21]\n" - "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z12.s }, p4, [x23]\n" + "st1w { z13.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x22]\n" + "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z20.s }, p4, [x21]\n" + "st1w { z21.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, MUL VL]\n" "55:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -1407,16 +1407,16 @@ void sve_hybrid_u8u32_dot_6x4VL ( "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1428,143 +1428,143 @@ void sve_hybrid_u8u32_dot_6x4VL ( "b 62f\n" "61:" // Height 6: setup direct input "mov x26, 
%x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "62:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 64f\n" "63:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z0.b }, p0/Z, [x26]\n" - "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z6.b }, p0/Z, [x25]\n" "sub x27, x27, #0x10\n" - "ld1rqb { z2.b }, p0/Z, [x24]\n" - "ld1rqb { z3.b }, p0/Z, [x23]\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z4.b }, p0/Z, [x23]\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - "ld1rqb { z4.b }, p0/Z, [x22]\n" - "ld1rqb { z5.b }, p0/Z, [x21]\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z2.b }, p0/Z, [x21]\n" "add x25, x25, #0x10\n" "add x24, x24, #0x10\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z1.b, z7.b[0]\n" + "udot z12.s, z1.b, z6.b[0]\n" + "udot z16.s, z1.b, z5.b[0]\n" + "udot z20.s, z1.b, z4.b[0]\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - "udot z24.s, z6.b, z4.b[0]\n" - "udot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z24.s, z1.b, z3.b[0]\n" + "udot z28.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "add x21, x21, #0x10\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "udot z21.s, z7.b, z3.b[0]\n" - "udot z25.s, z7.b, z4.b[0]\n" - "udot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z26.s, z6.b, z4.b[0]\n" - "udot z30.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #4, MUL VL]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" - "udot z27.s, z7.b, z4.b[0]\n" - "udot z31.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #5, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" - "udot z24.s, z6.b, z4.b[1]\n" - "udot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #6, MUL VL]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "udot z25.s, z7.b, z4.b[1]\n" - "udot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #7, MUL VL]\n" + "udot z9.s, z0.b, z7.b[0]\n" + "udot z13.s, z0.b, z6.b[0]\n" + "udot z17.s, z0.b, z5.b[0]\n" + "udot z21.s, z0.b, z4.b[0]\n" + "udot z25.s, z0.b, z3.b[0]\n" + "udot z29.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z10.s, z1.b, z7.b[0]\n" + "udot z14.s, z1.b, z6.b[0]\n" + "udot z18.s, z1.b, z5.b[0]\n" + "udot z22.s, z1.b, z4.b[0]\n" + "udot z26.s, z1.b, z3.b[0]\n" + "udot z30.s, z1.b, z2.b[0]\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" + "udot z11.s, z0.b, z7.b[0]\n" + "udot z15.s, z0.b, z6.b[0]\n" + "udot z19.s, z0.b, z5.b[0]\n" + "udot z23.s, z0.b, z4.b[0]\n" + "udot z27.s, z0.b, z3.b[0]\n" + "udot z31.s, z0.b, z2.b[0]\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + "udot z8.s, z1.b, 
z7.b[1]\n" + "udot z12.s, z1.b, z6.b[1]\n" + "udot z16.s, z1.b, z5.b[1]\n" + "udot z20.s, z1.b, z4.b[1]\n" + "udot z24.s, z1.b, z3.b[1]\n" + "udot z28.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + "udot z9.s, z0.b, z7.b[1]\n" + "udot z13.s, z0.b, z6.b[1]\n" + "udot z17.s, z0.b, z5.b[1]\n" + "udot z21.s, z0.b, z4.b[1]\n" + "udot z25.s, z0.b, z3.b[1]\n" + "udot z29.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "udot z26.s, z6.b, z4.b[1]\n" - "udot z30.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #-8, MUL VL]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" - "udot z27.s, z7.b, z4.b[1]\n" - "udot z31.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #-7, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" - "udot z24.s, z6.b, z4.b[2]\n" - "udot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-6, MUL VL]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "udot z25.s, z7.b, z4.b[2]\n" - "udot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-5, MUL VL]\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "udot z26.s, z6.b, z4.b[2]\n" - "udot z30.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #-4, MUL VL]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" - "udot z27.s, z7.b, z4.b[2]\n" - "udot z31.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "udot z24.s, z6.b, z4.b[3]\n" - "udot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #-2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "udot z25.s, z7.b, z4.b[3]\n" - "udot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #-1, MUL VL]\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[3]\n" - "udot z26.s, z6.b, z4.b[3]\n" - "udot z30.s, z6.b, z5.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" - "udot z27.s, z7.b, z4.b[3]\n" - "udot z31.s, z7.b, z5.b[3]\n" + "udot z10.s, z1.b, z7.b[1]\n" + "udot z14.s, z1.b, z6.b[1]\n" + "udot z18.s, z1.b, z5.b[1]\n" + "udot z22.s, z1.b, z4.b[1]\n" + "udot z26.s, z1.b, z3.b[1]\n" + "udot z30.s, z1.b, z2.b[1]\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + "udot z11.s, z0.b, z7.b[1]\n" + "udot z15.s, z0.b, z6.b[1]\n" + "udot z19.s, z0.b, z5.b[1]\n" + "udot z23.s, z0.b, z4.b[1]\n" + "udot z27.s, z0.b, z3.b[1]\n" + "udot z31.s, z0.b, z2.b[1]\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + "udot z8.s, z1.b, z7.b[2]\n" + "udot z12.s, z1.b, z6.b[2]\n" + "udot z16.s, z1.b, z5.b[2]\n" + "udot z20.s, z1.b, z4.b[2]\n" + "udot z24.s, z1.b, z3.b[2]\n" + "udot z28.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + "udot z9.s, z0.b, z7.b[2]\n" + "udot z13.s, z0.b, z6.b[2]\n" + "udot z17.s, 
z0.b, z5.b[2]\n" + "udot z21.s, z0.b, z4.b[2]\n" + "udot z25.s, z0.b, z3.b[2]\n" + "udot z29.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + "udot z10.s, z1.b, z7.b[2]\n" + "udot z14.s, z1.b, z6.b[2]\n" + "udot z18.s, z1.b, z5.b[2]\n" + "udot z22.s, z1.b, z4.b[2]\n" + "udot z26.s, z1.b, z3.b[2]\n" + "udot z30.s, z1.b, z2.b[2]\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n" + "udot z11.s, z0.b, z7.b[2]\n" + "udot z15.s, z0.b, z6.b[2]\n" + "udot z19.s, z0.b, z5.b[2]\n" + "udot z23.s, z0.b, z4.b[2]\n" + "udot z27.s, z0.b, z3.b[2]\n" + "udot z31.s, z0.b, z2.b[2]\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + "udot z8.s, z1.b, z7.b[3]\n" + "udot z12.s, z1.b, z6.b[3]\n" + "udot z16.s, z1.b, z5.b[3]\n" + "udot z20.s, z1.b, z4.b[3]\n" + "udot z24.s, z1.b, z3.b[3]\n" + "udot z28.s, z1.b, z2.b[3]\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + "udot z9.s, z0.b, z7.b[3]\n" + "udot z13.s, z0.b, z6.b[3]\n" + "udot z17.s, z0.b, z5.b[3]\n" + "udot z21.s, z0.b, z4.b[3]\n" + "udot z25.s, z0.b, z3.b[3]\n" + "udot z29.s, z0.b, z2.b[3]\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + "udot z10.s, z1.b, z7.b[3]\n" + "udot z14.s, z1.b, z6.b[3]\n" + "udot z18.s, z1.b, z5.b[3]\n" + "udot z22.s, z1.b, z4.b[3]\n" + "udot z26.s, z1.b, z3.b[3]\n" + "udot z30.s, z1.b, z2.b[3]\n" + "udot z11.s, z0.b, z7.b[3]\n" + "udot z15.s, z0.b, z6.b[3]\n" + "udot z19.s, z0.b, z5.b[3]\n" + "udot z23.s, z0.b, z4.b[3]\n" + "udot z27.s, z0.b, z3.b[3]\n" + "udot z31.s, z0.b, z2.b[3]\n" "bgt 63b\n" "64:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" @@ -1575,127 +1575,127 @@ void sve_hybrid_u8u32_dot_6x4VL ( "ld1rqb { z3.b }, p0/Z, [x23]\n" "ld1rqb { z4.b }, p0/Z, [x22]\n" "ld1rqb { z5.b }, p0/Z, [x21]\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[0]\n" - "udot z12.s, z6.b, z1.b[0]\n" - "udot z16.s, z6.b, z2.b[0]\n" - "udot z20.s, z6.b, z3.b[0]\n" - "udot z24.s, z6.b, z4.b[0]\n" - "udot z28.s, z6.b, z5.b[0]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[0]\n" - "udot z13.s, z7.b, z1.b[0]\n" - "udot z17.s, z7.b, z2.b[0]\n" - "udot z21.s, z7.b, z3.b[0]\n" - "udot z25.s, z7.b, z4.b[0]\n" - "udot z29.s, z7.b, z5.b[0]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z7.b, z0.b[0]\n" + "udot z12.s, z7.b, z1.b[0]\n" + "udot z16.s, z7.b, z2.b[0]\n" + "udot z20.s, z7.b, z3.b[0]\n" + "udot z24.s, z7.b, z4.b[0]\n" + "udot z28.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z6.b, z0.b[0]\n" + "udot z13.s, z6.b, z1.b[0]\n" + "udot z17.s, z6.b, z2.b[0]\n" + "udot z21.s, z6.b, z3.b[0]\n" + "udot z25.s, z6.b, z4.b[0]\n" + "udot z29.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[0]\n" - "udot z14.s, z6.b, z1.b[0]\n" - "udot z18.s, z6.b, z2.b[0]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z26.s, z6.b, z4.b[0]\n" - "udot z30.s, z6.b, z5.b[0]\n" - "udot z11.s, z7.b, z0.b[0]\n" - "udot z15.s, z7.b, z1.b[0]\n" - "udot z19.s, z7.b, z2.b[0]\n" - "udot z23.s, z7.b, z3.b[0]\n" - "udot z27.s, z7.b, z4.b[0]\n" - "udot z31.s, z7.b, z5.b[0]\n" + "udot z10.s, z7.b, z0.b[0]\n" + "udot z14.s, z7.b, z1.b[0]\n" + "udot z18.s, z7.b, z2.b[0]\n" + "udot z22.s, z7.b, z3.b[0]\n" + "udot z26.s, z7.b, z4.b[0]\n" + "udot z30.s, z7.b, z5.b[0]\n" + "udot z11.s, z6.b, z0.b[0]\n" + "udot z15.s, z6.b, z1.b[0]\n" + "udot z19.s, z6.b, z2.b[0]\n" + "udot 
z23.s, z6.b, z3.b[0]\n" + "udot z27.s, z6.b, z4.b[0]\n" + "udot z31.s, z6.b, z5.b[0]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[1]\n" - "udot z12.s, z6.b, z1.b[1]\n" - "udot z16.s, z6.b, z2.b[1]\n" - "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z7.b, z0.b[1]\n" + "udot z12.s, z7.b, z1.b[1]\n" + "udot z16.s, z7.b, z2.b[1]\n" + "udot z20.s, z7.b, z3.b[1]\n" "subs x27, x27, #0x4\n" - "udot z24.s, z6.b, z4.b[1]\n" - "udot z28.s, z6.b, z5.b[1]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[1]\n" - "udot z13.s, z7.b, z1.b[1]\n" - "udot z17.s, z7.b, z2.b[1]\n" - "udot z21.s, z7.b, z3.b[1]\n" - "udot z25.s, z7.b, z4.b[1]\n" - "udot z29.s, z7.b, z5.b[1]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z24.s, z7.b, z4.b[1]\n" + "udot z28.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z6.b, z0.b[1]\n" + "udot z13.s, z6.b, z1.b[1]\n" + "udot z17.s, z6.b, z2.b[1]\n" + "udot z21.s, z6.b, z3.b[1]\n" + "udot z25.s, z6.b, z4.b[1]\n" + "udot z29.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[1]\n" - "udot z14.s, z6.b, z1.b[1]\n" - "udot z18.s, z6.b, z2.b[1]\n" - "udot z22.s, z6.b, z3.b[1]\n" - "udot z26.s, z6.b, z4.b[1]\n" - "udot z30.s, z6.b, z5.b[1]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z15.s, z7.b, z1.b[1]\n" - "udot z19.s, z7.b, z2.b[1]\n" - "udot z23.s, z7.b, z3.b[1]\n" - "udot z27.s, z7.b, z4.b[1]\n" - "udot z31.s, z7.b, z5.b[1]\n" + "udot z10.s, z7.b, z0.b[1]\n" + "udot z14.s, z7.b, z1.b[1]\n" + "udot z18.s, z7.b, z2.b[1]\n" + "udot z22.s, z7.b, z3.b[1]\n" + "udot z26.s, z7.b, z4.b[1]\n" + "udot z30.s, z7.b, z5.b[1]\n" + "udot z11.s, z6.b, z0.b[1]\n" + "udot z15.s, z6.b, z1.b[1]\n" + "udot z19.s, z6.b, z2.b[1]\n" + "udot z23.s, z6.b, z3.b[1]\n" + "udot z27.s, z6.b, z4.b[1]\n" + "udot z31.s, z6.b, z5.b[1]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[2]\n" - "udot z12.s, z6.b, z1.b[2]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z7.b, z0.b[2]\n" + "udot z12.s, z7.b, z1.b[2]\n" + "udot z16.s, z7.b, z2.b[2]\n" + "udot z20.s, z7.b, z3.b[2]\n" "subs x27, x27, #0x4\n" - "udot z24.s, z6.b, z4.b[2]\n" - "udot z28.s, z6.b, z5.b[2]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[2]\n" - "udot z13.s, z7.b, z1.b[2]\n" - "udot z17.s, z7.b, z2.b[2]\n" - "udot z21.s, z7.b, z3.b[2]\n" - "udot z25.s, z7.b, z4.b[2]\n" - "udot z29.s, z7.b, z5.b[2]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "udot z24.s, z7.b, z4.b[2]\n" + "udot z28.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z6.b, z0.b[2]\n" + "udot z13.s, z6.b, z1.b[2]\n" + "udot z17.s, z6.b, z2.b[2]\n" + "udot z21.s, z6.b, z3.b[2]\n" + "udot z25.s, z6.b, z4.b[2]\n" + "udot z29.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[2]\n" - "udot z14.s, z6.b, z1.b[2]\n" - "udot z18.s, z6.b, z2.b[2]\n" - "udot z22.s, z6.b, z3.b[2]\n" - "udot z26.s, z6.b, z4.b[2]\n" - "udot z30.s, z6.b, z5.b[2]\n" - "udot z11.s, z7.b, z0.b[2]\n" - "udot z15.s, z7.b, z1.b[2]\n" - "udot z19.s, z7.b, z2.b[2]\n" - "udot z23.s, z7.b, z3.b[2]\n" - "udot z27.s, z7.b, z4.b[2]\n" - "udot z31.s, z7.b, 
z5.b[2]\n" + "udot z10.s, z7.b, z0.b[2]\n" + "udot z14.s, z7.b, z1.b[2]\n" + "udot z18.s, z7.b, z2.b[2]\n" + "udot z22.s, z7.b, z3.b[2]\n" + "udot z26.s, z7.b, z4.b[2]\n" + "udot z30.s, z7.b, z5.b[2]\n" + "udot z11.s, z6.b, z0.b[2]\n" + "udot z15.s, z6.b, z1.b[2]\n" + "udot z19.s, z6.b, z2.b[2]\n" + "udot z23.s, z6.b, z3.b[2]\n" + "udot z27.s, z6.b, z4.b[2]\n" + "udot z31.s, z6.b, z5.b[2]\n" "ble 65f\n" - "ld1b { z6.b }, p5/Z, [x10]\n" - "ld1b { z7.b }, p5/Z, [x10, #1, MUL VL]\n" - "udot z8.s, z6.b, z0.b[3]\n" - "udot z12.s, z6.b, z1.b[3]\n" - "udot z16.s, z6.b, z2.b[3]\n" - "udot z20.s, z6.b, z3.b[3]\n" - "udot z24.s, z6.b, z4.b[3]\n" - "udot z28.s, z6.b, z5.b[3]\n" - "ld1b { z6.b }, p5/Z, [x10, #2, MUL VL]\n" - "udot z9.s, z7.b, z0.b[3]\n" - "udot z13.s, z7.b, z1.b[3]\n" - "udot z17.s, z7.b, z2.b[3]\n" - "udot z21.s, z7.b, z3.b[3]\n" - "udot z25.s, z7.b, z4.b[3]\n" - "udot z29.s, z7.b, z5.b[3]\n" - "ld1b { z7.b }, p5/Z, [x10, #3, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10]\n" + "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" + "udot z8.s, z7.b, z0.b[3]\n" + "udot z12.s, z7.b, z1.b[3]\n" + "udot z16.s, z7.b, z2.b[3]\n" + "udot z20.s, z7.b, z3.b[3]\n" + "udot z24.s, z7.b, z4.b[3]\n" + "udot z28.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "udot z9.s, z6.b, z0.b[3]\n" + "udot z13.s, z6.b, z1.b[3]\n" + "udot z17.s, z6.b, z2.b[3]\n" + "udot z21.s, z6.b, z3.b[3]\n" + "udot z25.s, z6.b, z4.b[3]\n" + "udot z29.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" "addvl x10, x10, #4\n" - "udot z10.s, z6.b, z0.b[3]\n" - "udot z14.s, z6.b, z1.b[3]\n" - "udot z18.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[3]\n" - "udot z26.s, z6.b, z4.b[3]\n" - "udot z30.s, z6.b, z5.b[3]\n" - "udot z11.s, z7.b, z0.b[3]\n" - "udot z15.s, z7.b, z1.b[3]\n" - "udot z19.s, z7.b, z2.b[3]\n" - "udot z23.s, z7.b, z3.b[3]\n" - "udot z27.s, z7.b, z4.b[3]\n" - "udot z31.s, z7.b, z5.b[3]\n" + "udot z10.s, z7.b, z0.b[3]\n" + "udot z14.s, z7.b, z1.b[3]\n" + "udot z18.s, z7.b, z2.b[3]\n" + "udot z22.s, z7.b, z3.b[3]\n" + "udot z26.s, z7.b, z4.b[3]\n" + "udot z30.s, z7.b, z5.b[3]\n" + "udot z11.s, z6.b, z0.b[3]\n" + "udot z15.s, z6.b, z1.b[3]\n" + "udot z19.s, z6.b, z2.b[3]\n" + "udot z23.s, z6.b, z3.b[3]\n" + "udot z27.s, z6.b, z4.b[3]\n" + "udot z31.s, z6.b, z5.b[3]\n" "65:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1748,7 +1748,6 @@ void sve_hybrid_u8u32_dot_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "68:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1756,4 +1755,4 @@ void sve_hybrid_u8u32_dot_6x4VL ( } } // namespace arm_gemm -#endif 
// ARM_COMPUTE_ENABLE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp index c0d089278e..8c6a3dba7d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. */ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -74,7 +74,6 @@ public: template static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) { - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -86,7 +85,6 @@ public: } } - if (std::is_same::value) { switch (ci->get_cpu_model()) { default: @@ -111,5 +109,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp index 59f33289b4..9269576d90 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp @@ -100,16 +100,16 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "incw x20\n" "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 3f\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip1 z9.d, z18.d, z13.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 4f\n" @@ -127,11 +127,11 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "5:" // Height 1: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr 
x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 6f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" "cbnz x28, 7f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -143,86 +143,86 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "ble 9f\n" "8:" // Height 1: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n" + ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n" + ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n" + ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n" + ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n" + ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n" + ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n" + "ld1b { z16.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45d09a8a // ummla z10.s, z20.b, z16.b\n" + ".inst 0x45c79a8e // ummla z14.s, 
z20.b, z7.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n" + ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "bgt 8b\n" "9:" // Height 1: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n" + ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n" + ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n" + ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n" + ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n" "addvl x10, x10, #8\n" "ble 10f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n" + ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n" + ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n" + ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL 
VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n" + ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n" "addvl x10, x10, #8\n" "10:" // Height 1: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -258,21 +258,21 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 14f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x9, x20, LSL #2\n" + "ld1w { z18.s }, p4/Z, [x9]\n" + "ld1w { z2.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" + "ld1w { z12.s }, p4/Z, [x20]\n" + "zip1 z8.d, z18.d, z12.d\n" + "zip2 z12.d, z18.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x20, #2, MUL VL]\n" + "zip1 z9.d, z2.d, z13.d\n" + "zip2 z13.d, z2.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x20, #3, MUL VL]\n" + "zip1 z10.d, z17.d, z14.d\n" + "zip2 z14.d, z17.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" "b 15f\n" @@ -290,12 +290,12 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "16:" // Height 2: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 17f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" "cbnz x28, 18f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -303,95 +303,95 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "b 18f\n" "17:" // Height 2: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" + "add x25, x26, x21\n" "18:" // Height 2: input setup done "cmp x27, #0x10\n" "ble 20f\n" "19:" // Height 2: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z20.b }, p0/Z, [x26]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z20.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n" + 
".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n" + ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n" + ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" + "trn2 z20.d, z20.d, z19.d\n" + ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n" + ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-8, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45d19a88 // ummla z8.s, z20.b, z17.b\n" + ".inst 0x45d09a8c // ummla z12.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45d19a89 // ummla z9.s, z20.b, z17.b\n" + ".inst 0x45d09a8d // ummla z13.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45d19a8a // ummla z10.s, z20.b, z17.b\n" + ".inst 0x45d09a8e // ummla z14.s, z20.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #-2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #-1, MUL VL]\n" "sub x27, x27, #0x10\n" "cmp x27, #0x10\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + ".inst 0x45d19a8b // ummla z11.s, z20.b, z17.b\n" + ".inst 0x45d09a8f // ummla z15.s, z20.b, z16.b\n" "add x26, x26, #0x10\n" "add x25, x25, #0x10\n" "bgt 19b\n" "20:" // Height 2: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1rqb { z19.b }, p0/Z, [x25]\n" + "trn1 z18.d, z1.d, z19.d\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19a48 // ummla z8.s, z18.b, z17.b\n" + ".inst 0x45d09a4c // ummla z12.s, z18.b, z16.b\n" + 
"ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19a49 // ummla z9.s, z18.b, z17.b\n" + ".inst 0x45d09a4d // ummla z13.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d19a4a // ummla z10.s, z18.b, z17.b\n" + ".inst 0x45d09a4e // ummla z14.s, z18.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" "subs x27, x27, #0x8\n" - "trn2 z1.d, z1.d, z2.d\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" + "trn2 z1.d, z1.d, z19.d\n" + ".inst 0x45d19a4b // ummla z11.s, z18.b, z17.b\n" + ".inst 0x45d09a4f // ummla z15.s, z18.b, z16.b\n" "addvl x10, x10, #8\n" "ble 21f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" + "ld1b { z17.b }, p5/Z, [x10]\n" + "ld1b { z16.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d19828 // ummla z8.s, z1.b, z17.b\n" + ".inst 0x45d0982c // ummla z12.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d19829 // ummla z9.s, z1.b, z17.b\n" + ".inst 0x45d0982d // ummla z13.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d1982a // ummla z10.s, z1.b, z17.b\n" + ".inst 0x45d0982e // ummla z14.s, z1.b, z16.b\n" + "ld1b { z17.b }, p5/Z, [x10, #6, MUL VL]\n" + "ld1b { z16.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d1982b // ummla z11.s, z1.b, z17.b\n" + ".inst 0x45d0982f // ummla z15.s, z1.b, z16.b\n" "addvl x10, x10, #8\n" "21:" // Height 2: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" @@ -399,24 +399,24 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "cmp x28, x20\n" "bne 16b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x20, x9, x20, LSL #2\n" + "uzp1 z16.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" + "uzp1 z17.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" - "uzp1 z13.d, z10.d, z14.d\n" + "st1w { z16.s }, p4, [x9]\n" + "uzp1 z16.d, z10.d, z14.d\n" "uzp2 z10.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" - "uzp1 z14.d, z11.d, z15.d\n" + "st1w { z17.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z2.d, z11.d, z15.d\n" "uzp2 z11.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "st1w { z16.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z2.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" - "st1w { z8.s }, p4, [x24]\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z8.s }, p4, [x20]\n" + "st1w { z9.s }, p3, [x20, #1, 
MUL VL]\n" + "st1w { z10.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x20, #3, MUL VL]\n" "22:" // Height 2: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -437,28 +437,28 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 25f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x21]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x21, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x20]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" "zip1 z17.d, z18.d, z21.d\n" @@ -490,13 +490,13 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "27:" // Height 3: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 28f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" "cbnz x28, 29f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -505,169 +505,169 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "b 29f\n" "28:" // Height 3: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" "29:" // Height 3: input setup done "cmp x27, #0x10\n" "ble 31f\n" "30:" // Height 3: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, 
[x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "trn1 z27.d, z30.d, z24.d\n" + "trn2 z30.d, z30.d, z24.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "trn1 z26.d, z28.d, z29.d\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n" + ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n" + ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n" + ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n" + ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z29.d\n" + ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n" + ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n" + ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, #0x10\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d89b6e // ummla z14.s, z27.b, z24.b\n" + ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n" + ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n" + ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n" + ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 
0x45c69876 // ummla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n" + ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n" + ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n" + ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n" + ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n" + ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n" + ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n" + ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n" + ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n" "bgt 30b\n" "31:" // Height 3: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn1 z27.d, z1.d, z24.d\n" + "trn2 z1.d, z1.d, z24.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "trn1 z26.d, z3.d, z28.d\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99b68 // ummla z8.s, z27.b, z25.b\n" + ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n" + ".inst 0x45d89b6c // ummla z12.s, z27.b, z24.b\n" + ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99b69 // ummla z9.s, z27.b, z25.b\n" + ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d89b6d // ummla z13.s, z27.b, z24.b\n" + ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z28.d\n" + ".inst 0x45d99b6a // ummla z10.s, z27.b, z25.b\n" + ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d89b6e // ummla z14.s, z27.b, 
z24.b\n" + ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45d99b6b // ummla z11.s, z27.b, z25.b\n" + ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n" + ".inst 0x45d89b6f // ummla z15.s, z27.b, z24.b\n" + ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n" "ble 32f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n" + ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n" + ".inst 0x45d8982c // ummla z12.s, z1.b, z24.b\n" + ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n" + ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n" + ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n" + ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n" + ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n" + ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n" + ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n" + ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n" "32:" // Height 3: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 27b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x21, x9, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" + "uzp1 z25.d, z8.d, z12.d\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z24.d, z9.d, z13.d\n" + "st1w { z25.s }, p4, [x9]\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z25.d, z10.d, 
z14.d\n" + "st1w { z24.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z24.d, z11.d, z15.d\n" + "st1w { z25.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" "uzp1 z16.d, z16.d, z20.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "st1w { z24.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp1 z17.d, z17.d, z21.d\n" "uzp1 z18.d, z18.d, z22.d\n" - "st1w { z8.s }, p4, [x24]\n" + "st1w { z8.s }, p4, [x21]\n" "uzp1 z19.d, z19.d, z23.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x23]\n" - "st1w { z17.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x23, #3, MUL VL]\n" + "st1w { z9.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x20, #3, MUL VL]\n" "33:" // Height 3: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -688,37 +688,37 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 36f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" - "add x22, x23, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x22]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x22, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x21]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z24.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x20]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" @@ -746,14 +746,14 @@ void 
sve_hybrid_u8u32_mmla_6x4VL ( "38:" // Height 4: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 39f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" "cbnz x28, 40f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -763,182 +763,182 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "b 40f\n" "39:" // Height 4: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" "40:" // Height 4: input setup done "cmp x27, #0x10\n" "ble 42f\n" "41:" // Height 4: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" + "ld1rqb { z30.b }, p0/Z, [x26]\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z29.d, z30.d, z24.d\n" + "ld1rqb { z28.b }, p0/Z, [x24]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z30.d, z30.d, z24.d\n" + "trn1 z26.d, z28.d, z27.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99ba8 // ummla z8.s, z29.b, z25.b\n" + ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n" + ".inst 0x45d89bac // ummla z12.s, z29.b, z24.b\n" + ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99ba9 // ummla z9.s, z29.b, z25.b\n" + ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + "trn2 z28.d, z28.d, z27.d\n" + ".inst 0x45d89bad // ummla z13.s, z29.b, z24.b\n" + ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d99baa // ummla z10.s, z29.b, z25.b\n" + ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" "cmp x27, #0x10\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d89bae // ummla 
z14.s, z29.b, z24.b\n" + ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45d99bab // ummla z11.s, z29.b, z25.b\n" + ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-8, MUL VL]\n" "add x26, x26, #0x10\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45d89baf // ummla z15.s, z29.b, z24.b\n" + ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-7, MUL VL]\n" "add x25, x25, #0x10\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45d99bc8 // ummla z8.s, z30.b, z25.b\n" + ".inst 0x45d99b90 // ummla z16.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-6, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45d89bcc // ummla z12.s, z30.b, z24.b\n" + ".inst 0x45d89b94 // ummla z20.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-5, MUL VL]\n" "add x23, x23, #0x10\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45d99bc9 // ummla z9.s, z30.b, z25.b\n" + ".inst 0x45d99b91 // ummla z17.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x45d89bcd // ummla z13.s, z30.b, z24.b\n" + ".inst 0x45d89b95 // ummla z21.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45d99bca // ummla z10.s, z30.b, z25.b\n" + ".inst 0x45d99b92 // ummla z18.s, z28.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45d89bce // ummla z14.s, z30.b, z24.b\n" + ".inst 0x45d89b96 // ummla z22.s, z28.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45d99bcb // ummla z11.s, z30.b, z25.b\n" + ".inst 0x45d99b93 // ummla z19.s, z28.b, z25.b\n" + ".inst 0x45d89bcf // ummla z15.s, z30.b, z24.b\n" + ".inst 0x45d89b97 // ummla z23.s, z28.b, z24.b\n" "bgt 41b\n" "42:" // Height 4: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z24.b }, p0/Z, [x25]\n" + "trn1 z28.d, z1.d, z24.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // 
ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + "ld1rqb { z27.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z24.d\n" + "trn1 z26.d, z3.d, z27.d\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99b88 // ummla z8.s, z28.b, z25.b\n" + ".inst 0x45d99b50 // ummla z16.s, z26.b, z25.b\n" + ".inst 0x45d89b8c // ummla z12.s, z28.b, z24.b\n" + ".inst 0x45d89b54 // ummla z20.s, z26.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99b89 // ummla z9.s, z28.b, z25.b\n" + ".inst 0x45d99b51 // ummla z17.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - "trn2 z3.d, z3.d, z4.d\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45d89b8d // ummla z13.s, z28.b, z24.b\n" + ".inst 0x45d89b55 // ummla z21.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + "trn2 z3.d, z3.d, z27.d\n" + ".inst 0x45d99b8a // ummla z10.s, z28.b, z25.b\n" + ".inst 0x45d99b52 // ummla z18.s, z26.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d89b8e // ummla z14.s, z28.b, z24.b\n" + ".inst 0x45d89b56 // ummla z22.s, z26.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" + ".inst 0x45d99b8b // ummla z11.s, z28.b, z25.b\n" + ".inst 0x45d99b53 // ummla z19.s, z26.b, z25.b\n" + ".inst 0x45d89b8f // ummla z15.s, z28.b, z24.b\n" + ".inst 0x45d89b57 // ummla z23.s, z26.b, z24.b\n" "ble 43f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z25.b }, p5/Z, [x10]\n" + "ld1b { z24.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45d99828 // ummla z8.s, z1.b, z25.b\n" + ".inst 0x45d99870 // ummla z16.s, z3.b, z25.b\n" + ".inst 0x45d8982c // ummla 
z12.s, z1.b, z24.b\n" + ".inst 0x45d89874 // ummla z20.s, z3.b, z24.b\n" + "ld1b { z25.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1b { z24.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45d99829 // ummla z9.s, z1.b, z25.b\n" + ".inst 0x45d99871 // ummla z17.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45d8982d // ummla z13.s, z1.b, z24.b\n" + ".inst 0x45d89875 // ummla z21.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45d9982a // ummla z10.s, z1.b, z25.b\n" + ".inst 0x45d99872 // ummla z18.s, z3.b, z25.b\n" + "ld1b { z25.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45d8982e // ummla z14.s, z1.b, z24.b\n" + ".inst 0x45d89876 // ummla z22.s, z3.b, z24.b\n" + "ld1b { z24.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" + ".inst 0x45d9982b // ummla z11.s, z1.b, z25.b\n" + ".inst 0x45d99873 // ummla z19.s, z3.b, z25.b\n" + ".inst 0x45d8982f // ummla z15.s, z1.b, z24.b\n" + ".inst 0x45d89877 // ummla z23.s, z3.b, z24.b\n" "43:" // Height 4: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 38b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" - "add x22, x23, x20, LSL #2\n" + "add x22, x9, x20, LSL #2\n" + "add x21, x22, x20, LSL #2\n" + "uzp1 z25.d, z8.d, z12.d\n" + "add x20, x21, x20, LSL #2\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z24.d, z9.d, z13.d\n" + "st1w { z25.s }, p4, [x9]\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z25.d, z10.d, z14.d\n" + "st1w { z24.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z24.d, z11.d, z15.d\n" + "st1w { z25.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" - "uzp1 z15.d, z16.d, z20.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "uzp1 z25.d, z16.d, z20.d\n" + "st1w { z24.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp2 z16.d, z16.d, z20.d\n" - "uzp1 z20.d, z17.d, z21.d\n" - "st1w { z8.s }, p4, [x24]\n" + "uzp1 z24.d, z17.d, z21.d\n" + "st1w { z8.s }, p4, [x22]\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "st1w { z9.s }, p3, [x22, #1, MUL VL]\n" "uzp2 z18.d, z18.d, z22.d\n" - "uzp1 z22.d, z19.d, z23.d\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "uzp1 z20.d, z19.d, z23.d\n" + "st1w { z10.s }, p2, [x22, #2, MUL VL]\n" "uzp2 z19.d, z19.d, z23.d\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" - "st1w { z15.s }, p4, [x23]\n" - "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x22]\n" - "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z11.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z25.s }, p4, [x21]\n" + "st1w { z24.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x20]\n" + "st1w { z17.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x20, #2, MUL VL]\n" + "st1w { 
z19.s }, p1, [x20, #3, MUL VL]\n" "44:" // Height 4: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -959,54 +959,54 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "whilelt p1.s, x20, x11\n" "tbz %x[flags], #0, 47f\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "ld1w { z19.s }, p4/Z, [x9]\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "add x20, x21, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" - "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" - "zip2 z12.d, z9.d, z12.d\n" - "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" - "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip1 z9.d, z10.d, z13.d\n" - "zip2 z13.d, z10.d, z13.d\n" - "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" - "ld1w { z17.s }, p4/Z, [x23]\n" - "zip1 z10.d, z11.d, z14.d\n" - "zip2 z14.d, z11.d, z14.d\n" - "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" - "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x23]\n" + "zip1 z8.d, z19.d, z12.d\n" + "zip2 z12.d, z19.d, z12.d\n" + "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n" + "zip1 z9.d, z17.d, z13.d\n" + "zip2 z13.d, z17.d, z13.d\n" + "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z17.s }, p4/Z, [x22]\n" + "zip1 z10.d, z18.d, z14.d\n" + "zip2 z14.d, z18.d, z14.d\n" + "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n" "zip1 z11.d, z16.d, z15.d\n" "zip2 z15.d, z16.d, z15.d\n" - "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" - "ld1w { z20.s }, p4/Z, [x22]\n" + "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x21]\n" "zip1 z16.d, z17.d, z20.d\n" "zip2 z20.d, z17.d, z20.d\n" - "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n" - "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n" "zip1 z17.d, z18.d, z21.d\n" "zip2 z21.d, z18.d, z21.d\n" - "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n" - "ld1w { z25.s }, p4/Z, [x21]\n" + "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z25.s }, p4/Z, [x20]\n" "zip1 z18.d, z19.d, z22.d\n" "zip2 z22.d, z19.d, z22.d\n" - "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n" - "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n" "zip1 z19.d, z24.d, z23.d\n" "zip2 z23.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "zip1 z24.d, z25.d, z28.d\n" "zip2 z28.d, z25.d, z28.d\n" "zip1 z25.d, z26.d, z29.d\n" "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 48f\n" "47:" // Height 5: no accumulate "mov z8.s, #0x0\n" @@ -1038,15 +1038,15 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "49:" // Height 5: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 50f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, 
#0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" "cbnz x28, 51f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1057,231 +1057,231 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "b 51f\n" "50:" // Height 5: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" "51:" // Height 5: input setup done "cmp x27, #0x10\n" "ble 53f\n" "52:" // Height 5: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z6.b }, p0/Z, [x26]\n" + "ld1rqb { z1.b }, p0/Z, [x25]\n" + "ld1rqb { z7.b }, p0/Z, [x24]\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z5.d, z6.d, z1.d\n" + "trn2 z6.d, z6.d, z1.d\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "trn1 z3.d, z7.d, z2.d\n" + "trn2 z7.d, z7.d, z2.d\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "trn1 z2.d, z4.d, z0.d\n" + "trn2 z4.d, z4.d, z0.d\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c198a8 // ummla z8.s, z5.b, z1.b\n" + ".inst 0x45c19870 // ummla z16.s, z3.b, z1.b\n" + ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c098ac // ummla z12.s, z5.b, z0.b\n" + ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c198a9 // ummla z9.s, z5.b, z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45c19871 // ummla z17.s, z3.b, z1.b\n" + ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c098ad // ummla z13.s, z5.b, z0.b\n" + ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, 
#6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c198aa // ummla z10.s, z5.b, z1.b\n" + ".inst 0x45c19872 // ummla z18.s, z3.b, z1.b\n" + ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c098ae // ummla z14.s, z5.b, z0.b\n" + ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n" + ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" - ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" - ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + ".inst 0x45c198ab // ummla z11.s, z5.b, z1.b\n" + ".inst 0x45c19873 // ummla z19.s, z3.b, z1.b\n" + ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45c098af // ummla z15.s, z5.b, z0.b\n" + ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n" + ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n" + ".inst 0x45c198f0 // ummla z16.s, z7.b, z1.b\n" + ".inst 0x45c19898 // ummla z24.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n" + ".inst 0x45c098f4 // ummla z20.s, z7.b, z0.b\n" + ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n" + ".inst 0x45c198f1 // ummla z17.s, z7.b, z1.b\n" + ".inst 0x45c19899 // ummla z25.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, 
MUL VL]\n" + ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n" + ".inst 0x45c098f5 // ummla z21.s, z7.b, z0.b\n" + ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n" + ".inst 0x45c198f2 // ummla z18.s, z7.b, z1.b\n" + ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n" + ".inst 0x45c098f6 // ummla z22.s, z7.b, z0.b\n" + ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n" + ".inst 0x45c198f3 // ummla z19.s, z7.b, z1.b\n" + ".inst 0x45c1989b // ummla z27.s, z4.b, z1.b\n" + ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n" + ".inst 0x45c098f7 // ummla z23.s, z7.b, z0.b\n" + ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n" "bgt 52b\n" "53:" // Height 5: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" + "ld1rqb { z4.b }, p0/Z, [x25]\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn1 z0.d, z1.d, z2.d\n" - "trn2 z1.d, z1.d, z2.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn1 z7.d, z1.d, z4.d\n" + "trn2 z1.d, z1.d, z4.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "trn1 z2.d, z3.d, z4.d\n" - "trn2 z3.d, z3.d, z4.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "trn1 z6.d, z3.d, z2.d\n" + "trn2 z3.d, z3.d, z2.d\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n" + ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n" + ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n" + ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n" + ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n" + ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n" + ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + 
".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n" + ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n" + ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n" + ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n" + ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n" + ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n" + ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n" "addvl x10, x10, #8\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" - ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n" + ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n" + ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n" + ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n" + ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n" "ble 54f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n" + ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n" + ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n" + ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n" + ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n" + ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n" + ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n" + ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n" + ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n" + ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n" + ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n" + ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c09876 // ummla z22.s, 
z3.b, z0.b\n" + ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" - ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n" + ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n" + ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n" + ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n" + ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n" + ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n" "54:" // Height 5: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" "cmp x28, x20\n" "bne 49b\n" "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "add x24, x9, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "add x23, x9, x20, LSL #2\n" "add x22, x23, x20, LSL #2\n" + "uzp1 z2.d, z8.d, z12.d\n" "add x21, x22, x20, LSL #2\n" + "add x20, x21, x20, LSL #2\n" "uzp2 z8.d, z8.d, z12.d\n" - "uzp1 z12.d, z9.d, z13.d\n" + "uzp1 z1.d, z9.d, z13.d\n" "uzp2 z9.d, z9.d, z13.d\n" - "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z7.s }, p4, [x9]\n" + "uzp1 z0.d, z10.d, z14.d\n" + "st1w { z2.s }, p4, [x9]\n" "uzp2 z10.d, z10.d, z14.d\n" - "uzp1 z14.d, z11.d, z15.d\n" - "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" + "uzp1 z2.d, z11.d, z15.d\n" + "st1w { z1.s }, p3, [x9, #1, MUL VL]\n" "uzp2 z11.d, z11.d, z15.d\n" - "uzp1 z15.d, z16.d, z20.d\n" - "st1w { z13.s }, p2, [x9, #2, MUL VL]\n" + "uzp1 z1.d, z16.d, z20.d\n" + "st1w { z0.s }, p2, [x9, #2, MUL VL]\n" "uzp2 z16.d, z16.d, z20.d\n" - "uzp1 z20.d, z17.d, z21.d\n" - "st1w { z14.s }, p1, [x9, #3, MUL VL]\n" + "uzp1 z0.d, z17.d, z21.d\n" + "st1w { z2.s }, p1, [x9, #3, MUL VL]\n" "addvl x9, x9, #4\n" "uzp2 z17.d, z17.d, z21.d\n" "uzp1 z21.d, z18.d, z22.d\n" - "st1w { z8.s }, p4, [x24]\n" + "st1w { z8.s }, p4, [x23]\n" "uzp2 z18.d, z18.d, z22.d\n" - "uzp1 z22.d, z19.d, z23.d\n" - "st1w { z9.s }, p3, [x24, #1, MUL VL]\n" + "uzp1 z20.d, z19.d, z23.d\n" + "st1w { z9.s }, p3, [x23, #1, MUL VL]\n" "uzp2 z19.d, z19.d, z23.d\n" "uzp1 z24.d, z24.d, z28.d\n" - "st1w { z10.s }, p2, [x24, #2, MUL VL]\n" + "st1w { z10.s }, p2, [x23, #2, MUL VL]\n" "uzp1 z25.d, z25.d, z29.d\n" "uzp1 z26.d, z26.d, z30.d\n" - "st1w { z11.s }, p1, [x24, #3, MUL VL]\n" + "st1w { z11.s }, p1, [x23, #3, MUL VL]\n" "uzp1 z27.d, z27.d, z31.d\n" - "st1w { z15.s }, p4, [x23]\n" - "st1w { z20.s }, p3, [x23, #1, MUL VL]\n" - "st1w { z21.s }, p2, [x23, #2, MUL VL]\n" - "st1w { z22.s }, p1, [x23, #3, MUL VL]\n" - "st1w { z16.s }, p4, [x22]\n" - "st1w { z17.s }, p3, [x22, #1, MUL VL]\n" - "st1w { z18.s }, p2, [x22, #2, MUL VL]\n" - "st1w { z19.s }, p1, [x22, #3, MUL VL]\n" - "st1w { z24.s }, p4, [x21]\n" - "st1w { z25.s }, p3, [x21, #1, MUL VL]\n" - "st1w { z26.s }, p2, [x21, #2, MUL VL]\n" - "st1w { z27.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z1.s }, p4, [x22]\n" + "st1w { z0.s }, p3, [x22, #1, MUL VL]\n" + "st1w { z21.s }, p2, [x22, #2, MUL VL]\n" + "st1w { z20.s }, p1, [x22, #3, MUL VL]\n" + "st1w { z16.s }, p4, [x21]\n" + "st1w { z17.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x21, #3, MUL VL]\n" + "st1w { z24.s }, p4, [x20]\n" + "st1w { z25.s }, p3, [x20, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x20, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x20, #3, 
MUL VL]\n" "55:" // Height 5: Writeback done "decw x11, ALL, MUL #4\n" "cmp x11, XZR\n" @@ -1307,26 +1307,26 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" - "ld1w { z9.s }, p4/Z, [x9]\n" + "ld1w { z17.s }, p4/Z, [x9]\n" "add x22, x23, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" - "ld1w { z10.s }, p3/Z, [x9, #1, MUL VL]\n" - "ld1w { z11.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z18.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n" "add x20, x21, x20, LSL #2\n" "ld1w { z16.s }, p1/Z, [x9, #3, MUL VL]\n" "ld1w { z12.s }, p4/Z, [x24]\n" - "zip1 z8.d, z9.d, z12.d\n" + "zip1 z8.d, z17.d, z12.d\n" "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n" "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n" - "zip2 z12.d, z9.d, z12.d\n" - "zip1 z9.d, z10.d, z13.d\n" + "zip2 z12.d, z17.d, z12.d\n" + "zip1 z9.d, z18.d, z13.d\n" "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n" "ld1w { z17.s }, p4/Z, [x23]\n" - "zip2 z13.d, z10.d, z13.d\n" - "zip1 z10.d, z11.d, z14.d\n" + "zip2 z13.d, z18.d, z13.d\n" + "zip1 z10.d, z20.d, z14.d\n" "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n" "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n" - "zip2 z14.d, z11.d, z14.d\n" + "zip2 z14.d, z20.d, z14.d\n" "zip1 z11.d, z16.d, z15.d\n" "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n" "ld1w { z20.s }, p4/Z, [x22]\n" @@ -1344,7 +1344,7 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n" "zip2 z22.d, z19.d, z22.d\n" "zip1 z19.d, z24.d, z23.d\n" - "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n" + "ld1w { z0.s }, p1/Z, [x21, #3, MUL VL]\n" "ld1w { z28.s }, p4/Z, [x20]\n" "zip2 z23.d, z24.d, z23.d\n" "zip1 z24.d, z25.d, z28.d\n" @@ -1356,8 +1356,8 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "zip2 z29.d, z26.d, z29.d\n" "zip1 z26.d, z27.d, z30.d\n" "zip2 z30.d, z27.d, z30.d\n" - "zip1 z27.d, z6.d, z31.d\n" - "zip2 z31.d, z6.d, z31.d\n" + "zip1 z27.d, z0.d, z31.d\n" + "zip2 z31.d, z0.d, z31.d\n" "b 59f\n" "58:" // Height 6: no accumulate "mov z8.s, #0x0\n" @@ -1389,16 +1389,16 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "60:" // Height 6: String loop "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" "ldr w27, [x20, x28, LSL #0x2]\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" "tbz %x[flags], #3, 61f\n" - "ldr x21, [%x[input_ptr], x28, LSL #0x3]\n" - "add x21, x21, x20, LSL #3\n" - "ldr x26, [x21, #0x0]\n" - "ldr x25, [x21, #0x8]\n" - "ldr x24, [x21, #0x10]\n" - "ldr x23, [x21, #0x18]\n" - "ldr x22, [x21, #0x20]\n" - "ldr x21, [x21, #0x28]\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" "cbnz x28, 62f\n" "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" "add x26, x26, x20\n" @@ -1410,184 +1410,184 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "b 62f\n" "61:" // Height 6: setup direct input "mov x26, %x[input_ptr]\n" - "add x25, x26, x20\n" - "add x24, x25, x20\n" - "add x23, x24, x20\n" - "add x22, x23, x20\n" - "add x21, x22, x20\n" + "add x25, x26, x21\n" + "add x24, x25, x21\n" + "add x23, x24, x21\n" + "add x22, x23, x21\n" + "add x21, x22, x21\n" "62:" // Height 6: input setup done "cmp x27, #0x10\n" "ble 64f\n" "63:" // Height 6: Multiply loop: Main loop head "whilelt p0.b, XZR, x27\n" - "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" 
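// --- annotation (not upstream patch content): per 128-bit vector segment,
// UMMLA treats its two .b sources as 2x8 matrices of uint8 and accumulates a
// 2x2 block of uint32 results, acc[i][j] += sum_{k=0..7} A[i][k] * B[j][k].
// The ld1rqb loads above fetch one 16-byte quadword per A row, and the
// trn1/trn2 on .d elements below pair two rows into that 2x8 layout; the
// uzp1/uzp2 in the writeback sections undo the pairing. The '+' side of this
// hunk keeps the same instruction sequence and appears to differ only in
// register allocation.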
- "trn1 z0.d, z1.d, z2.d\n" - "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" - "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z7.b }, p0/Z, [x26]\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z6.d, z7.d, z0.d\n" + "ld1rqb { z5.b }, p0/Z, [x24]\n" + "ld1rqb { z1.b }, p0/Z, [x23]\n" + "trn2 z7.d, z7.d, z0.d\n" + "trn1 z4.d, z5.d, z1.d\n" + "ld1rqb { z3.b }, p0/Z, [x22]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z5.d, z5.d, z1.d\n" + "trn1 z2.d, z3.d, z0.d\n" + "trn2 z3.d, z3.d, z0.d\n" + "ld1b { z1.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c198c8 // ummla z8.s, z6.b, z1.b\n" + ".inst 0x45c19890 // ummla z16.s, z4.b, z1.b\n" + ".inst 0x45c19858 // ummla z24.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #2, MUL VL]\n" "sub x27, x27, #0x10\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" + ".inst 0x45c098cc // ummla z12.s, z6.b, z0.b\n" + ".inst 0x45c09894 // ummla z20.s, z4.b, z0.b\n" "cmp x27, #0x10\n" "add x26, x26, #0x10\n" - ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c0985c // ummla z28.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c198c9 // ummla z9.s, z6.b, z1.b\n" "add x25, x25, #0x10\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45c19891 // ummla z17.s, z4.b, z1.b\n" + ".inst 0x45c19859 // ummla z25.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #4, MUL VL]\n" "add x24, x24, #0x10\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" + ".inst 0x45c098cd // ummla z13.s, z6.b, z0.b\n" + ".inst 0x45c09895 // ummla z21.s, z4.b, z0.b\n" "add x23, x23, #0x10\n" "add x22, x22, #0x10\n" - ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" + ".inst 0x45c0985d // ummla z29.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c198ca // ummla z10.s, z6.b, z1.b\n" "add x21, x21, #0x10\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45c19892 // ummla z18.s, z4.b, z1.b\n" + ".inst 0x45c1985a // ummla z26.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c098ce // ummla z14.s, z6.b, z0.b\n" + ".inst 0x45c09896 // ummla z22.s, z4.b, z0.b\n" + ".inst 0x45c0985e // ummla z30.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #16\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" - "ld1b { z7.b 
}, p5/Z, [x10, #-8, MUL VL]\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n" - ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-7, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-6, MUL VL]\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-5, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-4, MUL VL]\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-3, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #-2, MUL VL]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #-1, MUL VL]\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" - ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + ".inst 0x45c198cb // ummla z11.s, z6.b, z1.b\n" + ".inst 0x45c19893 // ummla z19.s, z4.b, z1.b\n" + ".inst 0x45c1985b // ummla z27.s, z2.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-8, MUL VL]\n" + ".inst 0x45c098cf // ummla z15.s, z6.b, z0.b\n" + ".inst 0x45c09897 // ummla z23.s, z4.b, z0.b\n" + ".inst 0x45c0985f // ummla z31.s, z2.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-7, MUL VL]\n" + ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n" + ".inst 0x45c198b0 // ummla z16.s, z5.b, z1.b\n" + ".inst 0x45c19878 // ummla z24.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-6, MUL VL]\n" + ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n" + ".inst 0x45c098b4 // ummla z20.s, z5.b, z0.b\n" + ".inst 0x45c0987c // ummla z28.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-5, MUL VL]\n" + ".inst 0x45c198e9 // ummla z9.s, z7.b, z1.b\n" + ".inst 0x45c198b1 // ummla z17.s, z5.b, z1.b\n" + ".inst 0x45c19879 // ummla z25.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-4, MUL VL]\n" + ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n" + ".inst 0x45c098b5 // ummla z21.s, z5.b, z0.b\n" + ".inst 0x45c0987d // ummla z29.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-3, MUL VL]\n" + ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n" + ".inst 0x45c198b2 // ummla z18.s, z5.b, z1.b\n" + ".inst 0x45c1987a // ummla z26.s, z3.b, z1.b\n" + "ld1b { z1.b }, p5/Z, [x10, #-2, MUL VL]\n" + ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n" + ".inst 0x45c098b6 // ummla z22.s, z5.b, z0.b\n" + ".inst 0x45c0987e // ummla z30.s, z3.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #-1, MUL VL]\n" + ".inst 0x45c198eb // ummla z11.s, z7.b, z1.b\n" + ".inst 0x45c198b3 // ummla z19.s, z5.b, z1.b\n" + ".inst 0x45c1987b // ummla z27.s, z3.b, z1.b\n" + ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n" + ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n" + ".inst 0x45c0987f // ummla z31.s, 
z3.b, z0.b\n" "bgt 63b\n" "64:" // Height 6: Multiply loop: Single iteration only "whilelt p0.b, XZR, x27\n" "ld1rqb { z1.b }, p0/Z, [x26]\n" - "ld1rqb { z2.b }, p0/Z, [x25]\n" - "trn1 z0.d, z1.d, z2.d\n" + "ld1rqb { z0.b }, p0/Z, [x25]\n" + "trn1 z7.d, z1.d, z0.d\n" "ld1rqb { z3.b }, p0/Z, [x24]\n" - "ld1rqb { z4.b }, p0/Z, [x23]\n" - "trn2 z1.d, z1.d, z2.d\n" - "trn1 z2.d, z3.d, z4.d\n" + "ld1rqb { z2.b }, p0/Z, [x23]\n" + "trn2 z1.d, z1.d, z0.d\n" + "trn1 z6.d, z3.d, z2.d\n" "ld1rqb { z5.b }, p0/Z, [x22]\n" - "ld1rqb { z6.b }, p0/Z, [x21]\n" - "trn2 z3.d, z3.d, z4.d\n" - "trn1 z4.d, z5.d, z6.d\n" - "trn2 z5.d, z5.d, z6.d\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n" - ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n" - ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x21]\n" + "trn2 z3.d, z3.d, z2.d\n" + "trn1 z4.d, z5.d, z0.d\n" + "trn2 z5.d, z5.d, z0.d\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c298e8 // ummla z8.s, z7.b, z2.b\n" + ".inst 0x45c298d0 // ummla z16.s, z6.b, z2.b\n" + ".inst 0x45c29898 // ummla z24.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" "subs x27, x27, #0x8\n" - ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" - ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n" - ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n" - ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n" - ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" + ".inst 0x45c098ec // ummla z12.s, z7.b, z0.b\n" + ".inst 0x45c098d4 // ummla z20.s, z6.b, z0.b\n" + ".inst 0x45c0989c // ummla z28.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c298e9 // ummla z9.s, z7.b, z2.b\n" + ".inst 0x45c298d1 // ummla z17.s, z6.b, z2.b\n" + ".inst 0x45c29899 // ummla z25.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n" + ".inst 0x45c098d5 // ummla z21.s, z6.b, z0.b\n" + ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c298ea // ummla z10.s, z7.b, z2.b\n" + ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n" + ".inst 0x45c2989a // ummla z26.s, z4.b, z2.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c098ee // ummla z14.s, z7.b, z0.b\n" + ".inst 0x45c098d6 // ummla z22.s, z6.b, z0.b\n" + ".inst 0x45c0989e // ummla z30.s, z4.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" + ".inst 0x45c298eb // ummla z11.s, z7.b, z2.b\n" "addvl x10, x10, #8\n" - ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n" - ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n" - ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n" - ".inst 
0x45c69857 // ummla z23.s, z2.b, z6.b\n" - ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n" + ".inst 0x45c298d3 // ummla z19.s, z6.b, z2.b\n" + ".inst 0x45c2989b // ummla z27.s, z4.b, z2.b\n" + ".inst 0x45c098ef // ummla z15.s, z7.b, z0.b\n" + ".inst 0x45c098d7 // ummla z23.s, z6.b, z0.b\n" + ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n" "ble 65f\n" - "ld1b { z7.b }, p5/Z, [x10]\n" - "ld1b { z6.b }, p5/Z, [x10, #1, MUL VL]\n" - ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n" - ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n" - ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n" - ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #2, MUL VL]\n" - ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n" - ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #3, MUL VL]\n" - ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n" - ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n" - ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" - ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #4, MUL VL]\n" - ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n" - ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #5, MUL VL]\n" - ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n" - ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n" - ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - "ld1b { z7.b }, p5/Z, [x10, #6, MUL VL]\n" - ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n" - ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n" - "ld1b { z6.b }, p5/Z, [x10, #7, MUL VL]\n" + "ld1b { z2.b }, p5/Z, [x10]\n" + "ld1b { z0.b }, p5/Z, [x10, #1, MUL VL]\n" + ".inst 0x45c29828 // ummla z8.s, z1.b, z2.b\n" + ".inst 0x45c29870 // ummla z16.s, z3.b, z2.b\n" + ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n" + ".inst 0x45c0982c // ummla z12.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #2, MUL VL]\n" + ".inst 0x45c09874 // ummla z20.s, z3.b, z0.b\n" + ".inst 0x45c098bc // ummla z28.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #3, MUL VL]\n" + ".inst 0x45c29829 // ummla z9.s, z1.b, z2.b\n" + ".inst 0x45c29871 // ummla z17.s, z3.b, z2.b\n" + ".inst 0x45c298b9 // ummla z25.s, z5.b, z2.b\n" + ".inst 0x45c0982d // ummla z13.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #4, MUL VL]\n" + ".inst 0x45c09875 // ummla z21.s, z3.b, z0.b\n" + ".inst 0x45c098bd // ummla z29.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #5, MUL VL]\n" + ".inst 0x45c2982a // ummla z10.s, z1.b, z2.b\n" + ".inst 0x45c29872 // ummla z18.s, z3.b, z2.b\n" + ".inst 0x45c298ba // ummla z26.s, z5.b, z2.b\n" + ".inst 0x45c0982e // ummla z14.s, z1.b, z0.b\n" + "ld1b { z2.b }, p5/Z, [x10, #6, MUL VL]\n" + ".inst 0x45c09876 // ummla z22.s, z3.b, z0.b\n" + ".inst 0x45c098be // ummla z30.s, z5.b, z0.b\n" + "ld1b { z0.b }, p5/Z, [x10, #7, MUL VL]\n" "addvl x10, x10, #8\n" - ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n" - ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n" - ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n" - ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n" + ".inst 0x45c2982b // ummla z11.s, z1.b, z2.b\n" + ".inst 0x45c29873 // ummla z19.s, z3.b, z2.b\n" + ".inst 0x45c298bb // ummla z27.s, z5.b, z2.b\n" + ".inst 0x45c0982f // ummla z15.s, z1.b, z0.b\n" + ".inst 0x45c09877 // ummla z23.s, z3.b, z0.b\n" + ".inst 0x45c098bf // ummla z31.s, z5.b, z0.b\n" "65:" // Height 6: Multiply loop: multiply skip "ldr w20, [%x[args_ptr], 
%[offsetof_num_strings]]\n" "add x28, x28, #0x1\n" @@ -1596,7 +1596,7 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" "add x24, x9, x20, LSL #2\n" "add x23, x24, x20, LSL #2\n" - "uzp1 z7.d, z8.d, z12.d\n" + "uzp1 z0.d, z8.d, z12.d\n" "add x22, x23, x20, LSL #2\n" "add x21, x22, x20, LSL #2\n" "uzp2 z8.d, z8.d, z12.d\n" @@ -1604,7 +1604,7 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "add x20, x21, x20, LSL #2\n" "uzp2 z9.d, z9.d, z13.d\n" "uzp1 z13.d, z10.d, z14.d\n" - "st1w { z7.s }, p4, [x9]\n" + "st1w { z0.s }, p4, [x9]\n" "uzp2 z10.d, z10.d, z14.d\n" "uzp1 z14.d, z11.d, z15.d\n" "st1w { z12.s }, p3, [x9, #1, MUL VL]\n" @@ -1664,7 +1664,6 @@ void sve_hybrid_u8u32_mmla_6x4VL ( "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" "b 1b\n" "68:" // Exit - : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr) : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -1672,4 +1671,4 @@ void sve_hybrid_u8u32_mmla_6x4VL ( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp index f5fdf993aa..1ae035c614 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return get_vector_length() * 3; } - static unsigned int stripe_width() - { - return get_vector_length(); - } - static constexpr unsigned int k_unroll() { return 2; @@ -97,5 +92,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp index 94452929c6..e507bc5551 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_bf16fp32_dot_8x3VL( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,10 +89,10 @@ void sve_interleaved_bf16fp32_dot_8x3VL( "3:" // main loop head ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #32]\n" ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n" ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #48]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n" "sub x20, x20, #0x2\n" @@ -115,35 +119,35 @@ void sve_interleaved_bf16fp32_dot_8x3VL( ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n" ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n" ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n" - "ld1h { z6.h }, p0/Z, [x22, #5, MUL VL]\n" + "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n" "addvl x22, x22, #6\n" - ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n" - ".inst 0x646a408b // bfdot z11.s, z4.h, z2.h[1]\n" + ".inst 0x64634088 // bfdot z8.s, z4.h, z3.h[0]\n" + ".inst 0x646b408b // bfdot z11.s, z4.h, z3.h[1]\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" - ".inst 0x6472408e // bfdot z14.s, z4.h, z2.h[2]\n" - ".inst 0x647a4091 // bfdot z17.s, z4.h, z2.h[3]\n" - ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n" - ".inst 0x646b4097 // bfdot z23.s, z4.h, z3.h[1]\n" - ".inst 0x6473409a // bfdot z26.s, z4.h, z3.h[2]\n" - ".inst 0x647b409d // bfdot z29.s, z4.h, z3.h[3]\n" + ".inst 0x6473408e // bfdot z14.s, z4.h, z3.h[2]\n" + ".inst 0x647b4091 // bfdot z17.s, z4.h, z3.h[3]\n" + ".inst 0x64674094 // bfdot z20.s, z4.h, z7.h[0]\n" + ".inst 0x646f4097 // bfdot z23.s, z4.h, z7.h[1]\n" + ".inst 0x6477409a // bfdot z26.s, z4.h, z7.h[2]\n" + ".inst 0x647f409d // bfdot z29.s, z4.h, z7.h[3]\n" "ld1h { z4.h }, p0/Z, [x22]\n" - ".inst 0x646240a9 // bfdot z9.s, z5.h, z2.h[0]\n" - ".inst 0x646a40ac // bfdot z12.s, z5.h, z2.h[1]\n" - ".inst 0x647240af // bfdot z15.s, z5.h, z2.h[2]\n" - ".inst 0x647a40b2 // bfdot z18.s, z5.h, z2.h[3]\n" - ".inst 0x646340b5 // bfdot z21.s, z5.h, z3.h[0]\n" - ".inst 0x646b40b8 // bfdot z24.s, z5.h, z3.h[1]\n" - ".inst 0x647340bb // bfdot z27.s, z5.h, z3.h[2]\n" - ".inst 0x647b40be // bfdot z30.s, z5.h, z3.h[3]\n" + ".inst 0x646340a9 // bfdot z9.s, z5.h, z3.h[0]\n" + ".inst 0x646b40ac // bfdot z12.s, z5.h, 
z3.h[1]\n" + ".inst 0x647340af // bfdot z15.s, z5.h, z3.h[2]\n" + ".inst 0x647b40b2 // bfdot z18.s, z5.h, z3.h[3]\n" + ".inst 0x646740b5 // bfdot z21.s, z5.h, z7.h[0]\n" + ".inst 0x646f40b8 // bfdot z24.s, z5.h, z7.h[1]\n" + ".inst 0x647740bb // bfdot z27.s, z5.h, z7.h[2]\n" + ".inst 0x647f40be // bfdot z30.s, z5.h, z7.h[3]\n" "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x646240ca // bfdot z10.s, z6.h, z2.h[0]\n" - ".inst 0x646a40cd // bfdot z13.s, z6.h, z2.h[1]\n" - ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" - ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n" - ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" - ".inst 0x646b40d9 // bfdot z25.s, z6.h, z3.h[1]\n" - ".inst 0x647340dc // bfdot z28.s, z6.h, z3.h[2]\n" - ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n" + ".inst 0x6463404a // bfdot z10.s, z2.h, z3.h[0]\n" + ".inst 0x646b404d // bfdot z13.s, z2.h, z3.h[1]\n" + ".inst 0x64734050 // bfdot z16.s, z2.h, z3.h[2]\n" + ".inst 0x647b4053 // bfdot z19.s, z2.h, z3.h[3]\n" + ".inst 0x64674056 // bfdot z22.s, z2.h, z7.h[0]\n" + ".inst 0x646f4059 // bfdot z25.s, z2.h, z7.h[1]\n" + ".inst 0x6477405c // bfdot z28.s, z2.h, z7.h[2]\n" + ".inst 0x647f405f // bfdot z31.s, z2.h, z7.h[3]\n" "ld1h { z6.h }, p0/Z, [x22, #2, MUL VL]\n" "bge 3b\n" "4:" // main loop skip @@ -174,37 +178,37 @@ void sve_interleaved_bf16fp32_dot_8x3VL( ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n" ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n" "cbz x20, 5f\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqh { z4.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1h { z7.h }, p0/Z, [x22]\n" - "ld1h { z4.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n" - "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" - ".inst 0x647040ee // bfdot z14.s, z7.h, z0.h[2]\n" - ".inst 0x647840f1 // bfdot z17.s, z7.h, z0.h[3]\n" - ".inst 0x646140f4 // bfdot z20.s, z7.h, z1.h[0]\n" + "ld1h { z2.h }, p0/Z, [x22]\n" + "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x64644048 // bfdot z8.s, z2.h, z4.h[0]\n" + "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x646c404b // bfdot z11.s, z2.h, z4.h[1]\n" + ".inst 0x6474404e // bfdot z14.s, z2.h, z4.h[2]\n" + ".inst 0x647c4051 // bfdot z17.s, z2.h, z4.h[3]\n" + ".inst 0x64634054 // bfdot z20.s, z2.h, z3.h[0]\n" "addvl x22, x22, #3\n" - ".inst 0x646940f7 // bfdot z23.s, z7.h, z1.h[1]\n" - ".inst 0x647140fa // bfdot z26.s, z7.h, z1.h[2]\n" - ".inst 0x647940fd // bfdot z29.s, z7.h, z1.h[3]\n" - ".inst 0x64604089 // bfdot z9.s, z4.h, z0.h[0]\n" - ".inst 0x6468408c // bfdot z12.s, z4.h, z0.h[1]\n" - ".inst 0x6470408f // bfdot z15.s, z4.h, z0.h[2]\n" - ".inst 0x64784092 // bfdot z18.s, z4.h, z0.h[3]\n" - ".inst 0x64614095 // bfdot z21.s, z4.h, z1.h[0]\n" - ".inst 0x64694098 // bfdot z24.s, z4.h, z1.h[1]\n" - ".inst 0x6471409b // bfdot z27.s, z4.h, z1.h[2]\n" - ".inst 0x6479409e // bfdot z30.s, z4.h, z1.h[3]\n" - ".inst 0x646040aa // bfdot z10.s, z5.h, z0.h[0]\n" - ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n" - ".inst 0x647040b0 // bfdot z16.s, z5.h, z0.h[2]\n" - ".inst 0x647840b3 // bfdot z19.s, z5.h, z0.h[3]\n" - ".inst 0x646140b6 // bfdot z22.s, z5.h, z1.h[0]\n" - ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n" - ".inst 0x647140bc // bfdot z28.s, z5.h, z1.h[2]\n" - ".inst 0x647940bf // bfdot z31.s, z5.h, z1.h[3]\n" + ".inst 0x646b4057 // bfdot z23.s, z2.h, z3.h[1]\n" + 
".inst 0x6473405a // bfdot z26.s, z2.h, z3.h[2]\n" + ".inst 0x647b405d // bfdot z29.s, z2.h, z3.h[3]\n" + ".inst 0x64644029 // bfdot z9.s, z1.h, z4.h[0]\n" + ".inst 0x646c402c // bfdot z12.s, z1.h, z4.h[1]\n" + ".inst 0x6474402f // bfdot z15.s, z1.h, z4.h[2]\n" + ".inst 0x647c4032 // bfdot z18.s, z1.h, z4.h[3]\n" + ".inst 0x64634035 // bfdot z21.s, z1.h, z3.h[0]\n" + ".inst 0x646b4038 // bfdot z24.s, z1.h, z3.h[1]\n" + ".inst 0x6473403b // bfdot z27.s, z1.h, z3.h[2]\n" + ".inst 0x647b403e // bfdot z30.s, z1.h, z3.h[3]\n" + ".inst 0x6464400a // bfdot z10.s, z0.h, z4.h[0]\n" + ".inst 0x646c400d // bfdot z13.s, z0.h, z4.h[1]\n" + ".inst 0x64744010 // bfdot z16.s, z0.h, z4.h[2]\n" + ".inst 0x647c4013 // bfdot z19.s, z0.h, z4.h[3]\n" + ".inst 0x64634016 // bfdot z22.s, z0.h, z3.h[0]\n" + ".inst 0x646b4019 // bfdot z25.s, z0.h, z3.h[1]\n" + ".inst 0x6473401c // bfdot z28.s, z0.h, z3.h[2]\n" + ".inst 0x647b401f // bfdot z31.s, z0.h, z3.h[3]\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -243,4 +247,4 @@ void sve_interleaved_bf16fp32_dot_8x3VL( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp index 1de8c68494..c5096ff4ba 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../bfloat.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return get_vector_length() * 3; } - static unsigned int stripe_width() - { - return get_vector_length(); - } - static constexpr unsigned int k_unroll() { return 4; @@ -109,5 +104,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp index fe5382db05..ba7185752a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_bf16fp32_mmla_8x3VL( - const bfloat16 *Apanel, const bfloat16 *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *Apanel, + const bfloat16 *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,82 +89,82 @@ void sve_interleaved_bf16fp32_mmla_8x3VL( "mov z31.b, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel]]\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n" - "ld1h { z6.h }, p0/Z, [x22]\n" + "ld1h { z7.h }, p0/Z, [x22]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n" - "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n" - "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" - "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" - ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n" + "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x6464e4da // bfmmla z26.s, z6.h, z4.h\n" + ".inst 0x6465e4dd // bfmmla z29.s, z6.h, z5.h\n" + "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n" + "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n" + ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n" + ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n" + ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n" "sub x20, x20, #0x2\n" - ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n" + ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n" + ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n" "cmp x20, #0x2\n" - ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n" - ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n" + ".inst 0x6467e4db // bfmmla z27.s, z6.h, z7.h\n" + ".inst 0x6463e4de // bfmmla z30.s, z6.h, z3.h\n" + "ld1h { z3.h }, p0/Z, [x22, #4, MUL VL]\n" + ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n" + ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n" - ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n" - ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n" + ".inst 0x6465e430 // bfmmla z16.s, 
z1.h, z5.h\n" + ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n" + ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n" + ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n" "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n" - ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n" - "ld1h { z4.h }, p0/Z, [x22, #6, MUL VL]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "ld1h { z5.h }, p0/Z, [x22, #7, MUL VL]\n" + ".inst 0x6465e4dc // bfmmla z28.s, z6.h, z5.h\n" + ".inst 0x6464e4df // bfmmla z31.s, z6.h, z4.h\n" + "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #64]\n" + "ld1h { z2.h }, p0/Z, [x22, #6, MUL VL]\n" + ".inst 0x6463e408 // bfmmla z8.s, z0.h, z3.h\n" + "ld1h { z4.h }, p0/Z, [x22, #7, MUL VL]\n" "addvl x22, x22, #16\n" ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" + ".inst 0x6463e42e // bfmmla z14.s, z1.h, z3.h\n" ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n" - ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x22, #-8, MUL VL]\n" + ".inst 0x6463e4b4 // bfmmla z20.s, z5.h, z3.h\n" + ".inst 0x6467e4b7 // bfmmla z23.s, z5.h, z7.h\n" + ".inst 0x6463e4da // bfmmla z26.s, z6.h, z3.h\n" + ".inst 0x6467e4dd // bfmmla z29.s, z6.h, z7.h\n" + "ld1h { z3.h }, p0/Z, [x22, #-8, MUL VL]\n" "ld1h { z7.h }, p0/Z, [x22, #-7, MUL VL]\n" - ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n" - ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n" - ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n" - ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n" - ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n" - ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n" - ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n" - ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n" + ".inst 0x6462e409 // bfmmla z9.s, z0.h, z2.h\n" + ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n" + ".inst 0x6462e42f // bfmmla z15.s, z1.h, z2.h\n" + ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n" + ".inst 0x6462e4b5 // bfmmla z21.s, z5.h, z2.h\n" + ".inst 0x6464e4b8 // bfmmla z24.s, z5.h, z4.h\n" + ".inst 0x6462e4db // bfmmla z27.s, z6.h, z2.h\n" + ".inst 0x6464e4de // bfmmla z30.s, z6.h, z4.h\n" "ld1h { z4.h }, p0/Z, [x22, #-6, MUL VL]\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" + ".inst 0x6463e40a // bfmmla z10.s, z0.h, z3.h\n" ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n" - ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n" + ".inst 0x6463e430 // bfmmla z16.s, z1.h, z3.h\n" ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n" "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" + ".inst 0x6463e4b6 // bfmmla z22.s, z5.h, z3.h\n" + ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n" "ld1h { z5.h }, p0/Z, [x22, #-5, MUL VL]\n" - ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n" - ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" + ".inst 0x6463e4dc // bfmmla z28.s, z6.h, z3.h\n" + ".inst 0x6467e4df // bfmmla z31.s, z6.h, z7.h\n" "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n" "add 
%x[Apanel], %x[Apanel], #0x80\n" "addvl x22, x22, #-4\n" "bge 3b\n" "4:" // main loop skip - "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel]]\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" @@ -168,114 +172,114 @@ void sve_interleaved_bf16fp32_mmla_8x3VL( "ld1h { z6.h }, p0/Z, [x22]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n" - "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n" - "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" - "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z3.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x6464e4fa // bfmmla z26.s, z7.h, z4.h\n" + ".inst 0x6465e4fd // bfmmla z29.s, z7.h, z5.h\n" + "ld1h { z5.h }, p0/Z, [x22, #2, MUL VL]\n" + "ld1h { z4.h }, p0/Z, [x22, #3, MUL VL]\n" ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n" - ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n" + ".inst 0x6463e40c // bfmmla z12.s, z0.h, z3.h\n" ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n" - ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n" + ".inst 0x6463e432 // bfmmla z18.s, z1.h, z3.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n" - ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n" + ".inst 0x6463e458 // bfmmla z24.s, z2.h, z3.h\n" "addvl x22, x22, #4\n" - ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n" - ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n" - ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n" - ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n" - ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n" - ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n" - ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n" - ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n" - ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n" - ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n" + ".inst 0x6466e4fb // bfmmla z27.s, z7.h, z6.h\n" + ".inst 0x6463e4fe // bfmmla z30.s, z7.h, z3.h\n" + ".inst 0x6465e40a // bfmmla z10.s, z0.h, z5.h\n" + ".inst 0x6464e40d // bfmmla z13.s, z0.h, z4.h\n" + ".inst 0x6465e430 // bfmmla z16.s, z1.h, z5.h\n" + ".inst 0x6464e433 // bfmmla z19.s, z1.h, z4.h\n" + ".inst 0x6465e456 // bfmmla z22.s, z2.h, z5.h\n" + ".inst 0x6464e459 // bfmmla z25.s, z2.h, z4.h\n" + ".inst 0x6465e4fc // bfmmla z28.s, z7.h, z5.h\n" + ".inst 0x6464e4ff // bfmmla z31.s, z7.h, z4.h\n" "cbz x20, 5f\n" - "ld1h { z6.h }, p0/Z, [x22]\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" - "ld1h { z7.h }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n" - "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n" - "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n" - ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n" - ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n" - ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n" - "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n" - ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n" - "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n" - "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n" - "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n" - ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n" + "ld1h { z1.h }, p0/Z, [x22]\n" + "ld1rqh { z7.h }, p0/Z, 
[%x[Apanel]]\n" + ".inst 0x6461e4e8 // bfmmla z8.s, z7.h, z1.h\n" + "ld1rqh { z6.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1h { z0.h }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x6460e4eb // bfmmla z11.s, z7.h, z0.h\n" + "ld1rqh { z5.h }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqh { z4.h }, p0/Z, [%x[Apanel], #48]\n" + ".inst 0x6461e4ce // bfmmla z14.s, z6.h, z1.h\n" + ".inst 0x6460e4d1 // bfmmla z17.s, z6.h, z0.h\n" + ".inst 0x6461e4b4 // bfmmla z20.s, z5.h, z1.h\n" + "ld1h { z3.h }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x6460e4b7 // bfmmla z23.s, z5.h, z0.h\n" + ".inst 0x6461e49a // bfmmla z26.s, z4.h, z1.h\n" + "ld1h { z2.h }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x6460e49d // bfmmla z29.s, z4.h, z0.h\n" + "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n" + "ld1h { z0.h }, p0/Z, [x22, #5, MUL VL]\n" + ".inst 0x6463e4e9 // bfmmla z9.s, z7.h, z3.h\n" + ".inst 0x6462e4ec // bfmmla z12.s, z7.h, z2.h\n" "addvl x22, x22, #6\n" - ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n" - ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n" + ".inst 0x6463e4cf // bfmmla z15.s, z6.h, z3.h\n" + ".inst 0x6462e4d2 // bfmmla z18.s, z6.h, z2.h\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n" - ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n" - ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n" - ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n" - ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" - ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n" - ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n" - ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n" - ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n" - ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" - ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n" - ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" + ".inst 0x6463e4b5 // bfmmla z21.s, z5.h, z3.h\n" + ".inst 0x6462e4b8 // bfmmla z24.s, z5.h, z2.h\n" + ".inst 0x6463e49b // bfmmla z27.s, z4.h, z3.h\n" + ".inst 0x6462e49e // bfmmla z30.s, z4.h, z2.h\n" + ".inst 0x6461e4ea // bfmmla z10.s, z7.h, z1.h\n" + ".inst 0x6460e4ed // bfmmla z13.s, z7.h, z0.h\n" + ".inst 0x6461e4d0 // bfmmla z16.s, z6.h, z1.h\n" + ".inst 0x6460e4d3 // bfmmla z19.s, z6.h, z0.h\n" + ".inst 0x6461e4b6 // bfmmla z22.s, z5.h, z1.h\n" + ".inst 0x6460e4b9 // bfmmla z25.s, z5.h, z0.h\n" + ".inst 0x6461e49c // bfmmla z28.s, z4.h, z1.h\n" + ".inst 0x6460e49f // bfmmla z31.s, z4.h, z0.h\n" "5:" // multiply loop done - "uzp1 z4.d, z8.d, z11.d\n" + "uzp1 z0.d, z8.d, z11.d\n" "uzp2 z8.d, z8.d, z11.d\n" - "st1w { z4.s }, p0, [%x[Cpanel]]\n" - "uzp1 z11.d, z9.d, z12.d\n" + "st1w { z0.s }, p0, [%x[Cpanel]]\n" + "uzp1 z0.d, z9.d, z12.d\n" "uzp2 z9.d, z9.d, z12.d\n" - "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "uzp1 z12.d, z10.d, z13.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "uzp1 z0.d, z10.d, z13.d\n" "uzp2 z10.d, z10.d, z13.d\n" - "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "uzp1 z13.d, z14.d, z17.d\n" + "uzp1 z0.d, z14.d, z17.d\n" "uzp2 z14.d, z14.d, z17.d\n" "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n" - "uzp1 z17.d, z15.d, z18.d\n" + "uzp1 z1.d, z15.d, z18.d\n" "subs x23, x23, #0x1\n" "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "uzp2 z15.d, z15.d, z18.d\n" - "uzp1 z18.d, z16.d, z19.d\n" - "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "uzp1 z17.d, z16.d, z19.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "uzp2 z16.d, z16.d, z19.d\n" - "uzp1 z19.d, z20.d, z23.d\n" - "st1w { z17.s }, p0, [%x[Cpanel], 
#7, MUL VL]\n" + "uzp1 z0.d, z20.d, z23.d\n" + "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n" "addvl %x[Cpanel], %x[Cpanel], #16\n" "uzp2 z20.d, z20.d, z23.d\n" - "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" "uzp1 z23.d, z21.d, z24.d\n" "uzp2 z21.d, z21.d, z24.d\n" "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" - "uzp1 z24.d, z22.d, z25.d\n" + "uzp1 z19.d, z22.d, z25.d\n" "uzp2 z22.d, z22.d, z25.d\n" "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" - "uzp1 z25.d, z26.d, z29.d\n" + "uzp1 z18.d, z26.d, z29.d\n" "uzp2 z26.d, z26.d, z29.d\n" "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" - "uzp1 z29.d, z27.d, z30.d\n" + "uzp1 z17.d, z27.d, z30.d\n" "uzp2 z27.d, z27.d, z30.d\n" - "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" - "uzp1 z30.d, z28.d, z31.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "uzp1 z16.d, z28.d, z31.d\n" "uzp2 z28.d, z28.d, z31.d\n" "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" - "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" "st1w { z21.s }, p0, [%x[Cpanel]]\n" "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n" - "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n" "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n" @@ -290,4 +294,4 @@ void sve_interleaved_bf16fp32_mmla_8x3VL( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp index 6f1089d517..6c54167763 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return get_vector_length<__fp16>() * 3; } - static unsigned int stripe_width() - { - return get_vector_length<__fp16>(); - } - static constexpr unsigned int k_unroll() { return 1; @@ -81,6 +76,8 @@ public: return { 13.84, 2.07, 2.52 }; case CPUModel::V1: return { 31.90, 5.15, 10.34 }; + case CPUModel::A64FX: + return { 44.34, 3.23, 7.06 }; } } @@ -104,5 +101,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp index 9287509889..609277d889 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp @@ -28,8 +28,12 @@ namespace arm_gemm { void sve_interleaved_fp16_mla_8x3VL_a64fx( - const __fp16 *Apanel, const __fp16 *Bpanel, - __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *Apanel, + const __fp16 *Bpanel, + __fp16 *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -88,7 +92,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( "fmla z9.h, p0/M, z1.h, z3.h\n" "sub x20, x20, #0x2\n" "fmla z10.h, p0/M, z2.h, z3.h\n" - "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n" "fmla z11.h, p0/M, z0.h, z4.h\n" "fmla z12.h, p0/M, z1.h, z4.h\n" "fmla z13.h, p0/M, z2.h, z4.h\n" @@ -97,63 +101,63 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( "fmla z15.h, p0/M, z1.h, z5.h\n" "cmp x20, #0x2\n" "fmla z16.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #12]\n" "fmla z17.h, p0/M, z0.h, z6.h\n" "fmla z18.h, p0/M, z1.h, z6.h\n" "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" - "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1rh { z5.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z0.h, z7.h\n" + "fmla z21.h, p0/M, z1.h, z7.h\n" + "fmla z22.h, p0/M, z2.h, z7.h\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #16]\n" "fmla z23.h, p0/M, z0.h, z4.h\n" "fmla z24.h, p0/M, z1.h, z4.h\n" "fmla z25.h, p0/M, z2.h, z4.h\n" "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "ld1h { z0.h }, p0/Z, [x22, #3, MUL VL]\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" - "ld1h { z1.h }, p0/Z, [x22, #4, MUL VL]\n" - "ld1h { z2.h }, p0/Z, [x22, #5, MUL VL]\n" - "fmla z8.h, p0/M, z0.h, z3.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n" - "fmla z9.h, p0/M, z1.h, z3.h\n" - "fmla z10.h, p0/M, z2.h, z3.h\n" - "fmla z11.h, p0/M, z0.h, z4.h\n" - "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n" - "fmla z12.h, p0/M, z1.h, z4.h\n" - "fmla z13.h, p0/M, z2.h, z4.h\n" + "fmla z26.h, p0/M, z0.h, z3.h\n" + "fmla z27.h, p0/M, z1.h, z3.h\n" + "fmla z28.h, p0/M, z2.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #20]\n" + "fmla z29.h, p0/M, z0.h, z5.h\n" + "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n" + "fmla z30.h, p0/M, z1.h, z5.h\n" + "fmla z31.h, p0/M, z2.h, z5.h\n" + "ld1h { z2.h }, p0/Z, [x22, #4, MUL 
VL]\n" + "ld1h { z5.h }, p0/Z, [x22, #5, MUL VL]\n" + "fmla z8.h, p0/M, z6.h, z7.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #22]\n" + "fmla z9.h, p0/M, z2.h, z7.h\n" + "fmla z10.h, p0/M, z5.h, z7.h\n" + "fmla z11.h, p0/M, z6.h, z4.h\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #24]\n" + "fmla z12.h, p0/M, z2.h, z4.h\n" + "fmla z13.h, p0/M, z5.h, z4.h\n" "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n" - "fmla z14.h, p0/M, z0.h, z5.h\n" - "fmla z15.h, p0/M, z1.h, z5.h\n" + "fmla z14.h, p0/M, z6.h, z3.h\n" + "fmla z15.h, p0/M, z2.h, z3.h\n" "addvl x22, x22, #6\n" - "fmla z16.h, p0/M, z2.h, z5.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n" - "fmla z17.h, p0/M, z0.h, z6.h\n" - "fmla z18.h, p0/M, z1.h, z6.h\n" - "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n" + "fmla z16.h, p0/M, z5.h, z3.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #28]\n" + "fmla z17.h, p0/M, z6.h, z1.h\n" + "fmla z18.h, p0/M, z2.h, z1.h\n" + "fmla z19.h, p0/M, z5.h, z1.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #30]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" - "fmla z23.h, p0/M, z0.h, z4.h\n" + "fmla z20.h, p0/M, z6.h, z7.h\n" + "fmla z21.h, p0/M, z2.h, z7.h\n" + "fmla z22.h, p0/M, z5.h, z7.h\n" + "fmla z23.h, p0/M, z6.h, z4.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" - "fmla z24.h, p0/M, z1.h, z4.h\n" - "fmla z25.h, p0/M, z2.h, z4.h\n" + "fmla z24.h, p0/M, z2.h, z4.h\n" + "fmla z25.h, p0/M, z5.h, z4.h\n" "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" + "fmla z26.h, p0/M, z6.h, z0.h\n" + "fmla z27.h, p0/M, z2.h, z0.h\n" + "fmla z28.h, p0/M, z5.h, z0.h\n" + "fmla z29.h, p0/M, z6.h, z1.h\n" "ld1h { z0.h }, p0/Z, [x22]\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z30.h, p0/M, z2.h, z1.h\n" + "fmla z31.h, p0/M, z5.h, z1.h\n" "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n" "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" @@ -164,7 +168,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( "fmla z9.h, p0/M, z1.h, z3.h\n" "addvl x22, x22, #3\n" "fmla z10.h, p0/M, z2.h, z3.h\n" - "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" + "ld1rh { z7.h }, p0/Z, [%x[Apanel], #8]\n" "fmla z11.h, p0/M, z0.h, z4.h\n" "fmla z12.h, p0/M, z1.h, z4.h\n" "fmla z13.h, p0/M, z2.h, z4.h\n" @@ -176,58 +180,58 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( "fmla z17.h, p0/M, z0.h, z6.h\n" "fmla z18.h, p0/M, z1.h, z6.h\n" "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" + "ld1rh { z3.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z0.h, z7.h\n" + "fmla z21.h, p0/M, z1.h, z7.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" + "fmla z22.h, p0/M, z2.h, z7.h\n" "fmla z23.h, p0/M, z0.h, z4.h\n" "fmla z24.h, p0/M, z1.h, z4.h\n" "fmla z25.h, p0/M, z2.h, z4.h\n" "fmla z26.h, p0/M, z0.h, z5.h\n" "fmla z27.h, p0/M, z1.h, z5.h\n" "fmla z28.h, p0/M, z2.h, z5.h\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z29.h, p0/M, z0.h, z3.h\n" + "fmla z30.h, p0/M, z1.h, z3.h\n" + "fmla z31.h, p0/M, z2.h, z3.h\n" "cbz x20, 5f\n" - "ld1h { z0.h }, p0/Z, [x22]\n" - "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" - "ld1h { z2.h }, p0/Z, [x22, #2, MUL VL]\n" + "ld1h { z6.h }, 
p0/Z, [x22]\n" + "ld1h { z5.h }, p0/Z, [x22, #1, MUL VL]\n" + "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n" - "fmla z8.h, p0/M, z0.h, z3.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n" - "fmla z9.h, p0/M, z1.h, z3.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n" - "fmla z10.h, p0/M, z2.h, z3.h\n" - "fmla z11.h, p0/M, z0.h, z4.h\n" - "fmla z12.h, p0/M, z1.h, z4.h\n" - "fmla z13.h, p0/M, z2.h, z4.h\n" + "fmla z8.h, p0/M, z6.h, z3.h\n" + "ld1rh { z2.h }, p0/Z, [%x[Apanel], #2]\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #4]\n" + "fmla z9.h, p0/M, z5.h, z3.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #6]\n" + "fmla z10.h, p0/M, z4.h, z3.h\n" + "fmla z11.h, p0/M, z6.h, z2.h\n" + "fmla z12.h, p0/M, z5.h, z2.h\n" + "fmla z13.h, p0/M, z4.h, z2.h\n" "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n" - "fmla z14.h, p0/M, z0.h, z5.h\n" - "fmla z15.h, p0/M, z1.h, z5.h\n" - "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n" - "fmla z16.h, p0/M, z2.h, z5.h\n" - "fmla z17.h, p0/M, z0.h, z6.h\n" - "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n" - "fmla z18.h, p0/M, z1.h, z6.h\n" - "fmla z19.h, p0/M, z2.h, z6.h\n" - "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n" - "fmla z20.h, p0/M, z0.h, z3.h\n" - "fmla z21.h, p0/M, z1.h, z3.h\n" + "fmla z14.h, p0/M, z6.h, z1.h\n" + "fmla z15.h, p0/M, z5.h, z1.h\n" + "ld1rh { z2.h }, p0/Z, [%x[Apanel], #10]\n" + "fmla z16.h, p0/M, z4.h, z1.h\n" + "fmla z17.h, p0/M, z6.h, z0.h\n" + "ld1rh { z1.h }, p0/Z, [%x[Apanel], #12]\n" + "fmla z18.h, p0/M, z5.h, z0.h\n" + "fmla z19.h, p0/M, z4.h, z0.h\n" + "ld1rh { z0.h }, p0/Z, [%x[Apanel], #14]\n" + "fmla z20.h, p0/M, z6.h, z3.h\n" + "fmla z21.h, p0/M, z5.h, z3.h\n" "addvl x22, x22, #3\n" - "fmla z22.h, p0/M, z2.h, z3.h\n" - "fmla z23.h, p0/M, z0.h, z4.h\n" + "fmla z22.h, p0/M, z4.h, z3.h\n" + "fmla z23.h, p0/M, z6.h, z2.h\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla z24.h, p0/M, z1.h, z4.h\n" - "fmla z25.h, p0/M, z2.h, z4.h\n" - "fmla z26.h, p0/M, z0.h, z5.h\n" - "fmla z27.h, p0/M, z1.h, z5.h\n" - "fmla z28.h, p0/M, z2.h, z5.h\n" - "fmla z29.h, p0/M, z0.h, z6.h\n" - "fmla z30.h, p0/M, z1.h, z6.h\n" - "fmla z31.h, p0/M, z2.h, z6.h\n" + "fmla z24.h, p0/M, z5.h, z2.h\n" + "fmla z25.h, p0/M, z4.h, z2.h\n" + "fmla z26.h, p0/M, z6.h, z1.h\n" + "fmla z27.h, p0/M, z5.h, z1.h\n" + "fmla z28.h, p0/M, z4.h, z1.h\n" + "fmla z29.h, p0/M, z6.h, z0.h\n" + "fmla z30.h, p0/M, z5.h, z0.h\n" + "fmla z31.h, p0/M, z4.h, z0.h\n" "5:" // multiply loop done "st1h { z8.h }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -261,7 +265,7 @@ void sve_interleaved_fp16_mla_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp index 1ac2ac075e..3b16c97e2c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp @@ -28,8 +28,12 @@ namespace arm_gemm { void sve_interleaved_fp16_mla_8x3VL( - const __fp16 *Apanel, const __fp16 *Bpanel, - __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *Apanel, + const __fp16 *Bpanel, + __fp16 *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -83,16 +87,16 @@ void sve_interleaved_fp16_mla_8x3VL( "3:" // main loop head "fmla z8.h, z2.h, z0.h[0]\n" "fmla z11.h, z2.h, z0.h[1]\n" - "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqh { z7.h }, p0/Z, [%x[Apanel], #16]\n" "fmla z14.h, z2.h, z0.h[2]\n" "fmla z17.h, z2.h, z0.h[3]\n" - "ld1h { z5.h }, p0/Z, [x22, #3, MUL VL]\n" + "ld1h { z6.h }, p0/Z, [x22, #3, MUL VL]\n" "fmla z20.h, z2.h, z0.h[4]\n" "fmla z23.h, z2.h, z0.h[5]\n" - "ld1h { z6.h }, p0/Z, [x22, #4, MUL VL]\n" + "ld1h { z5.h }, p0/Z, [x22, #4, MUL VL]\n" "fmla z26.h, z2.h, z0.h[6]\n" "fmla z29.h, z2.h, z0.h[7]\n" - "ld1h { z7.h }, p0/Z, [x22, #5, MUL VL]\n" + "ld1h { z1.h }, p0/Z, [x22, #5, MUL VL]\n" "fmla z9.h, z3.h, z0.h[0]\n" "fmla z12.h, z3.h, z0.h[1]\n" "addvl x22, x22, #6\n" @@ -116,31 +120,31 @@ void sve_interleaved_fp16_mla_8x3VL( "fmla z28.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "fmla z8.h, z5.h, z1.h[0]\n" - "fmla z11.h, z5.h, z1.h[1]\n" + "fmla z8.h, z6.h, z7.h[0]\n" + "fmla z11.h, z6.h, z7.h[1]\n" "ld1h { z4.h }, p0/Z, [x22, #2, MUL VL]\n" - "fmla z14.h, z5.h, z1.h[2]\n" - "fmla z17.h, z5.h, z1.h[3]\n" - "fmla z20.h, z5.h, z1.h[4]\n" - "fmla z23.h, z5.h, z1.h[5]\n" - "fmla z26.h, z5.h, z1.h[6]\n" - "fmla z29.h, z5.h, z1.h[7]\n" - "fmla z9.h, z6.h, z1.h[0]\n" - "fmla z12.h, z6.h, z1.h[1]\n" - "fmla z15.h, z6.h, z1.h[2]\n" - "fmla z18.h, z6.h, z1.h[3]\n" - "fmla z21.h, z6.h, z1.h[4]\n" - "fmla z24.h, z6.h, z1.h[5]\n" - "fmla z27.h, z6.h, z1.h[6]\n" - "fmla z30.h, z6.h, z1.h[7]\n" - "fmla z10.h, z7.h, z1.h[0]\n" - "fmla z13.h, z7.h, z1.h[1]\n" - "fmla z16.h, z7.h, z1.h[2]\n" - "fmla z19.h, z7.h, z1.h[3]\n" - "fmla z22.h, z7.h, z1.h[4]\n" - "fmla z25.h, z7.h, z1.h[5]\n" - "fmla z28.h, z7.h, z1.h[6]\n" - "fmla z31.h, z7.h, z1.h[7]\n" + "fmla z14.h, z6.h, z7.h[2]\n" + "fmla z17.h, z6.h, z7.h[3]\n" + "fmla z20.h, z6.h, z7.h[4]\n" + "fmla z23.h, z6.h, z7.h[5]\n" + "fmla z26.h, z6.h, z7.h[6]\n" + "fmla z29.h, z6.h, z7.h[7]\n" + "fmla z9.h, z5.h, z7.h[0]\n" + "fmla z12.h, z5.h, z7.h[1]\n" + "fmla z15.h, z5.h, z7.h[2]\n" + "fmla z18.h, z5.h, z7.h[3]\n" + "fmla z21.h, z5.h, z7.h[4]\n" + "fmla z24.h, z5.h, z7.h[5]\n" + "fmla z27.h, z5.h, z7.h[6]\n" + "fmla z30.h, z5.h, z7.h[7]\n" + "fmla z10.h, z1.h, z7.h[0]\n" + "fmla z13.h, z1.h, z7.h[1]\n" + "fmla z16.h, z1.h, z7.h[2]\n" + "fmla z19.h, z1.h, z7.h[3]\n" + "fmla z22.h, z1.h, z7.h[4]\n" + "fmla z25.h, z1.h, z7.h[5]\n" + "fmla z28.h, z1.h, z7.h[6]\n" + "fmla z31.h, z1.h, z7.h[7]\n" "bge 3b\n" "4:" // main loop skip "fmla z8.h, z2.h, z0.h[0]\n" @@ -170,36 +174,36 @@ void sve_interleaved_fp16_mla_8x3VL( "fmla z28.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "cbz x20, 5f\n" - "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n" - "ld1h { z5.h }, p0/Z, [x22]\n" - "fmla z8.h, z5.h, z0.h[0]\n" - "ld1h { z6.h }, p0/Z, [x22, #1, MUL VL]\n" - "ld1h { z7.h }, p0/Z, [x22, #2, MUL VL]\n" - "fmla z11.h, z5.h, z0.h[1]\n" - "fmla z14.h, 
z5.h, z0.h[2]\n" - "fmla z17.h, z5.h, z0.h[3]\n" + "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n" + "ld1h { z2.h }, p0/Z, [x22]\n" + "fmla z8.h, z2.h, z3.h[0]\n" + "ld1h { z1.h }, p0/Z, [x22, #1, MUL VL]\n" + "ld1h { z0.h }, p0/Z, [x22, #2, MUL VL]\n" + "fmla z11.h, z2.h, z3.h[1]\n" + "fmla z14.h, z2.h, z3.h[2]\n" + "fmla z17.h, z2.h, z3.h[3]\n" "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla z20.h, z5.h, z0.h[4]\n" - "fmla z23.h, z5.h, z0.h[5]\n" + "fmla z20.h, z2.h, z3.h[4]\n" + "fmla z23.h, z2.h, z3.h[5]\n" "addvl x22, x22, #3\n" - "fmla z26.h, z5.h, z0.h[6]\n" - "fmla z29.h, z5.h, z0.h[7]\n" - "fmla z9.h, z6.h, z0.h[0]\n" - "fmla z12.h, z6.h, z0.h[1]\n" - "fmla z15.h, z6.h, z0.h[2]\n" - "fmla z18.h, z6.h, z0.h[3]\n" - "fmla z21.h, z6.h, z0.h[4]\n" - "fmla z24.h, z6.h, z0.h[5]\n" - "fmla z27.h, z6.h, z0.h[6]\n" - "fmla z30.h, z6.h, z0.h[7]\n" - "fmla z10.h, z7.h, z0.h[0]\n" - "fmla z13.h, z7.h, z0.h[1]\n" - "fmla z16.h, z7.h, z0.h[2]\n" - "fmla z19.h, z7.h, z0.h[3]\n" - "fmla z22.h, z7.h, z0.h[4]\n" - "fmla z25.h, z7.h, z0.h[5]\n" - "fmla z28.h, z7.h, z0.h[6]\n" - "fmla z31.h, z7.h, z0.h[7]\n" + "fmla z26.h, z2.h, z3.h[6]\n" + "fmla z29.h, z2.h, z3.h[7]\n" + "fmla z9.h, z1.h, z3.h[0]\n" + "fmla z12.h, z1.h, z3.h[1]\n" + "fmla z15.h, z1.h, z3.h[2]\n" + "fmla z18.h, z1.h, z3.h[3]\n" + "fmla z21.h, z1.h, z3.h[4]\n" + "fmla z24.h, z1.h, z3.h[5]\n" + "fmla z27.h, z1.h, z3.h[6]\n" + "fmla z30.h, z1.h, z3.h[7]\n" + "fmla z10.h, z0.h, z3.h[0]\n" + "fmla z13.h, z0.h, z3.h[1]\n" + "fmla z16.h, z0.h, z3.h[2]\n" + "fmla z19.h, z0.h, z3.h[3]\n" + "fmla z22.h, z0.h, z3.h[4]\n" + "fmla z25.h, z0.h, z3.h[5]\n" + "fmla z28.h, z0.h, z3.h[6]\n" + "fmla z31.h, z0.h, z3.h[7]\n" "5:" // multiply loop done "st1h { z8.h }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp index 29b928ee3b..23ab7ce10a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -56,11 +56,6 @@ public: return get_vector_length() * 3; } - static unsigned int stripe_width() - { - return get_vector_length(); - } - static constexpr unsigned int k_unroll() { return 1; @@ -75,10 +70,14 @@ public: if (std::is_same::value) { switch (ci->get_cpu_model()) { - case CPUModel::V1: - return { 15.15, 9.24, 6.42 }; default: return { 7.2307, 3.876, 2.932 }; + case CPUModel::A64FX: + return { 26.52, 3.42, 4.59 }; + case CPUModel::A510: + return { 6.25, 3.84, 2.47 }; + case CPUModel::V1: + return { 15.15, 9.24, 6.42 }; } } @@ -102,5 +101,4 @@ public: } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp index 3141a258a8..0b13913717 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp @@ -28,8 +28,12 @@ namespace arm_gemm { void sve_interleaved_fp32_mla_8x3VL_a64fx( - const float *Apanel, const float *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const float *Apanel, + const float *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -88,7 +92,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( "fmla z9.s, p0/M, z1.s, z3.s\n" "sub x20, x20, #0x2\n" "fmla z10.s, p0/M, z2.s, z3.s\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "fmla z11.s, p0/M, z0.s, z4.s\n" "fmla z12.s, p0/M, z1.s, z4.s\n" "fmla z13.s, p0/M, z2.s, z4.s\n" @@ -97,63 +101,63 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( "fmla z15.s, p0/M, z1.s, z5.s\n" "cmp x20, #0x2\n" "fmla z16.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n" "fmla z17.s, p0/M, z0.s, z6.s\n" "fmla z18.s, p0/M, z1.s, z6.s\n" "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z0.s, z7.s\n" + "fmla z21.s, p0/M, z1.s, z7.s\n" + "fmla z22.s, p0/M, z2.s, z7.s\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n" "fmla z23.s, p0/M, z0.s, z4.s\n" "fmla z24.s, p0/M, z1.s, z4.s\n" "fmla z25.s, p0/M, z2.s, z4.s\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" - "ld1w { z0.s }, p0/Z, [x22, #3, MUL VL]\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" - "ld1w { z1.s }, p0/Z, [x22, #4, MUL VL]\n" - "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n" - "fmla z8.s, p0/M, z0.s, z3.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n" - "fmla z9.s, p0/M, z1.s, z3.s\n" - "fmla z10.s, p0/M, z2.s, z3.s\n" - "fmla z11.s, p0/M, z0.s, z4.s\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" - "fmla z12.s, p0/M, z1.s, z4.s\n" - "fmla z13.s, p0/M, z2.s, z4.s\n" + "fmla z26.s, p0/M, z0.s, z3.s\n" + "fmla z27.s, p0/M, z1.s, z3.s\n" + "fmla z28.s, p0/M, z2.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n" + "fmla z29.s, p0/M, z0.s, 
z5.s\n" + "ld1w { z6.s }, p0/Z, [x22, #3, MUL VL]\n" + "fmla z30.s, p0/M, z1.s, z5.s\n" + "fmla z31.s, p0/M, z2.s, z5.s\n" + "ld1w { z2.s }, p0/Z, [x22, #4, MUL VL]\n" + "ld1w { z5.s }, p0/Z, [x22, #5, MUL VL]\n" + "fmla z8.s, p0/M, z6.s, z7.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n" + "fmla z9.s, p0/M, z2.s, z7.s\n" + "fmla z10.s, p0/M, z5.s, z7.s\n" + "fmla z11.s, p0/M, z6.s, z4.s\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n" + "fmla z12.s, p0/M, z2.s, z4.s\n" + "fmla z13.s, p0/M, z5.s, z4.s\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" - "fmla z14.s, p0/M, z0.s, z5.s\n" - "fmla z15.s, p0/M, z1.s, z5.s\n" + "fmla z14.s, p0/M, z6.s, z3.s\n" + "fmla z15.s, p0/M, z2.s, z3.s\n" "addvl x22, x22, #6\n" - "fmla z16.s, p0/M, z2.s, z5.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" - "fmla z17.s, p0/M, z0.s, z6.s\n" - "fmla z18.s, p0/M, z1.s, z6.s\n" - "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "fmla z16.s, p0/M, z5.s, z3.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n" + "fmla z17.s, p0/M, z6.s, z1.s\n" + "fmla z18.s, p0/M, z2.s, z1.s\n" + "fmla z19.s, p0/M, z5.s, z1.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n" "add %x[Apanel], %x[Apanel], #0x40\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" - "fmla z23.s, p0/M, z0.s, z4.s\n" + "fmla z20.s, p0/M, z6.s, z7.s\n" + "fmla z21.s, p0/M, z2.s, z7.s\n" + "fmla z22.s, p0/M, z5.s, z7.s\n" + "fmla z23.s, p0/M, z6.s, z4.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "fmla z24.s, p0/M, z1.s, z4.s\n" - "fmla z25.s, p0/M, z2.s, z4.s\n" + "fmla z24.s, p0/M, z2.s, z4.s\n" + "fmla z25.s, p0/M, z5.s, z4.s\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" + "fmla z26.s, p0/M, z6.s, z0.s\n" + "fmla z27.s, p0/M, z2.s, z0.s\n" + "fmla z28.s, p0/M, z5.s, z0.s\n" + "fmla z29.s, p0/M, z6.s, z1.s\n" "ld1w { z0.s }, p0/Z, [x22]\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z30.s, p0/M, z2.s, z1.s\n" + "fmla z31.s, p0/M, z5.s, z1.s\n" "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n" "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" @@ -164,7 +168,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( "fmla z9.s, p0/M, z1.s, z3.s\n" "addvl x22, x22, #3\n" "fmla z10.s, p0/M, z2.s, z3.s\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "fmla z11.s, p0/M, z0.s, z4.s\n" "fmla z12.s, p0/M, z1.s, z4.s\n" "fmla z13.s, p0/M, z2.s, z4.s\n" @@ -176,58 +180,58 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( "fmla z17.s, p0/M, z0.s, z6.s\n" "fmla z18.s, p0/M, z1.s, z6.s\n" "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z0.s, z7.s\n" + "fmla z21.s, p0/M, z1.s, z7.s\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" + "fmla z22.s, p0/M, z2.s, z7.s\n" "fmla z23.s, p0/M, z0.s, z4.s\n" "fmla z24.s, p0/M, z1.s, z4.s\n" "fmla z25.s, p0/M, z2.s, z4.s\n" "fmla z26.s, p0/M, z0.s, z5.s\n" "fmla z27.s, p0/M, z1.s, z5.s\n" "fmla z28.s, p0/M, z2.s, z5.s\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z29.s, p0/M, z0.s, z3.s\n" + "fmla z30.s, p0/M, z1.s, z3.s\n" + "fmla z31.s, p0/M, z2.s, 
z3.s\n" "cbz x20, 5f\n" - "ld1w { z0.s }, p0/Z, [x22]\n" - "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n" - "ld1w { z2.s }, p0/Z, [x22, #2, MUL VL]\n" + "ld1w { z6.s }, p0/Z, [x22]\n" + "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n" + "ld1w { z4.s }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "fmla z8.s, p0/M, z0.s, z3.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" - "fmla z9.s, p0/M, z1.s, z3.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" - "fmla z10.s, p0/M, z2.s, z3.s\n" - "fmla z11.s, p0/M, z0.s, z4.s\n" - "fmla z12.s, p0/M, z1.s, z4.s\n" - "fmla z13.s, p0/M, z2.s, z4.s\n" + "fmla z8.s, p0/M, z6.s, z3.s\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n" + "fmla z9.s, p0/M, z5.s, z3.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n" + "fmla z10.s, p0/M, z4.s, z3.s\n" + "fmla z11.s, p0/M, z6.s, z2.s\n" + "fmla z12.s, p0/M, z5.s, z2.s\n" + "fmla z13.s, p0/M, z4.s, z2.s\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" - "fmla z14.s, p0/M, z0.s, z5.s\n" - "fmla z15.s, p0/M, z1.s, z5.s\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" - "fmla z16.s, p0/M, z2.s, z5.s\n" - "fmla z17.s, p0/M, z0.s, z6.s\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" - "fmla z18.s, p0/M, z1.s, z6.s\n" - "fmla z19.s, p0/M, z2.s, z6.s\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "fmla z20.s, p0/M, z0.s, z3.s\n" - "fmla z21.s, p0/M, z1.s, z3.s\n" + "fmla z14.s, p0/M, z6.s, z1.s\n" + "fmla z15.s, p0/M, z5.s, z1.s\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n" + "fmla z16.s, p0/M, z4.s, z1.s\n" + "fmla z17.s, p0/M, z6.s, z0.s\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n" + "fmla z18.s, p0/M, z5.s, z0.s\n" + "fmla z19.s, p0/M, z4.s, z0.s\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n" + "fmla z20.s, p0/M, z6.s, z3.s\n" + "fmla z21.s, p0/M, z5.s, z3.s\n" "addvl x22, x22, #3\n" - "fmla z22.s, p0/M, z2.s, z3.s\n" - "fmla z23.s, p0/M, z0.s, z4.s\n" + "fmla z22.s, p0/M, z4.s, z3.s\n" + "fmla z23.s, p0/M, z6.s, z2.s\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla z24.s, p0/M, z1.s, z4.s\n" - "fmla z25.s, p0/M, z2.s, z4.s\n" - "fmla z26.s, p0/M, z0.s, z5.s\n" - "fmla z27.s, p0/M, z1.s, z5.s\n" - "fmla z28.s, p0/M, z2.s, z5.s\n" - "fmla z29.s, p0/M, z0.s, z6.s\n" - "fmla z30.s, p0/M, z1.s, z6.s\n" - "fmla z31.s, p0/M, z2.s, z6.s\n" + "fmla z24.s, p0/M, z5.s, z2.s\n" + "fmla z25.s, p0/M, z4.s, z2.s\n" + "fmla z26.s, p0/M, z6.s, z1.s\n" + "fmla z27.s, p0/M, z5.s, z1.s\n" + "fmla z28.s, p0/M, z4.s, z1.s\n" + "fmla z29.s, p0/M, z6.s, z0.s\n" + "fmla z30.s, p0/M, z5.s, z0.s\n" + "fmla z31.s, p0/M, z4.s, z0.s\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -261,7 +265,7 @@ void sve_interleaved_fp32_mla_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", 
"z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp index 9d1c0c3728..c7f32ff7a9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp @@ -28,8 +28,12 @@ namespace arm_gemm { void sve_interleaved_fp32_mla_8x3VL( - const float *Apanel, const float *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { + const float *Apanel, + const float *Bpanel, + float *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -84,10 +88,10 @@ void sve_interleaved_fp32_mla_8x3VL( "3:" // main loop head "fmla z8.s, z4.s, z0.s[0]\n" "fmla z11.s, z4.s, z0.s[1]\n" - "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #32]\n" "fmla z14.s, z4.s, z0.s[2]\n" "fmla z17.s, z4.s, z0.s[3]\n" - "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqw { z7.s }, p0/Z, [%x[Apanel], #48]\n" "fmla z20.s, z4.s, z1.s[0]\n" "fmla z23.s, z4.s, z1.s[1]\n" "sub x20, x20, #0x2\n" @@ -114,35 +118,35 @@ void sve_interleaved_fp32_mla_8x3VL( "fmla z25.s, z6.s, z1.s[1]\n" "fmla z28.s, z6.s, z1.s[2]\n" "fmla z31.s, z6.s, z1.s[3]\n" - "ld1w { z6.s }, p0/Z, [x22, #5, MUL VL]\n" + "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n" "addvl x22, x22, #6\n" - "fmla z8.s, z4.s, z2.s[0]\n" - "fmla z11.s, z4.s, z2.s[1]\n" + "fmla z8.s, z4.s, z3.s[0]\n" + "fmla z11.s, z4.s, z3.s[1]\n" "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n" - "fmla z14.s, z4.s, z2.s[2]\n" - "fmla z17.s, z4.s, z2.s[3]\n" - "fmla z20.s, z4.s, z3.s[0]\n" - "fmla z23.s, z4.s, z3.s[1]\n" - "fmla z26.s, z4.s, z3.s[2]\n" - "fmla z29.s, z4.s, z3.s[3]\n" + "fmla z14.s, z4.s, z3.s[2]\n" + "fmla z17.s, z4.s, z3.s[3]\n" + "fmla z20.s, z4.s, z7.s[0]\n" + "fmla z23.s, z4.s, z7.s[1]\n" + "fmla z26.s, z4.s, z7.s[2]\n" + "fmla z29.s, z4.s, z7.s[3]\n" "ld1w { z4.s }, p0/Z, [x22]\n" - "fmla z9.s, z5.s, z2.s[0]\n" - "fmla z12.s, z5.s, z2.s[1]\n" - "fmla z15.s, z5.s, z2.s[2]\n" - "fmla z18.s, z5.s, z2.s[3]\n" - "fmla z21.s, z5.s, z3.s[0]\n" - "fmla z24.s, z5.s, z3.s[1]\n" - "fmla z27.s, z5.s, z3.s[2]\n" - "fmla z30.s, z5.s, z3.s[3]\n" + "fmla z9.s, z5.s, z3.s[0]\n" + "fmla z12.s, z5.s, z3.s[1]\n" + "fmla z15.s, z5.s, z3.s[2]\n" + "fmla z18.s, z5.s, z3.s[3]\n" + "fmla z21.s, z5.s, z7.s[0]\n" + "fmla z24.s, z5.s, z7.s[1]\n" + "fmla z27.s, z5.s, z7.s[2]\n" + "fmla z30.s, z5.s, z7.s[3]\n" "ld1w { z5.s }, p0/Z, [x22, #1, MUL VL]\n" - "fmla z10.s, z6.s, z2.s[0]\n" - "fmla z13.s, z6.s, z2.s[1]\n" - "fmla z16.s, z6.s, z2.s[2]\n" - "fmla z19.s, z6.s, z2.s[3]\n" - "fmla z22.s, z6.s, z3.s[0]\n" - "fmla z25.s, z6.s, z3.s[1]\n" - "fmla z28.s, z6.s, z3.s[2]\n" - "fmla z31.s, z6.s, z3.s[3]\n" + "fmla z10.s, z2.s, z3.s[0]\n" + "fmla z13.s, z2.s, z3.s[1]\n" + "fmla z16.s, z2.s, z3.s[2]\n" + "fmla z19.s, z2.s, z3.s[3]\n" + "fmla z22.s, z2.s, z7.s[0]\n" + "fmla z25.s, z2.s, z7.s[1]\n" + "fmla z28.s, z2.s, z7.s[2]\n" + "fmla z31.s, z2.s, z7.s[3]\n" "ld1w { z6.s }, p0/Z, [x22, #2, MUL VL]\n" "bge 3b\n" "4:" // main loop skip @@ -173,37 +177,37 @@ void sve_interleaved_fp32_mla_8x3VL( "fmla z28.s, z6.s, z1.s[2]\n" "fmla z31.s, z6.s, z1.s[3]\n" "cbz x20, 5f\n" - "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n" - "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqw { z4.s }, p0/Z, [%x[Apanel]]\n" + "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1w 
{ z7.s }, p0/Z, [x22]\n" - "ld1w { z4.s }, p0/Z, [x22, #1, MUL VL]\n" - "fmla z8.s, z7.s, z0.s[0]\n" - "ld1w { z5.s }, p0/Z, [x22, #2, MUL VL]\n" - "fmla z11.s, z7.s, z0.s[1]\n" - "fmla z14.s, z7.s, z0.s[2]\n" - "fmla z17.s, z7.s, z0.s[3]\n" - "fmla z20.s, z7.s, z1.s[0]\n" + "ld1w { z2.s }, p0/Z, [x22]\n" + "ld1w { z1.s }, p0/Z, [x22, #1, MUL VL]\n" + "fmla z8.s, z2.s, z4.s[0]\n" + "ld1w { z0.s }, p0/Z, [x22, #2, MUL VL]\n" + "fmla z11.s, z2.s, z4.s[1]\n" + "fmla z14.s, z2.s, z4.s[2]\n" + "fmla z17.s, z2.s, z4.s[3]\n" + "fmla z20.s, z2.s, z3.s[0]\n" "addvl x22, x22, #3\n" - "fmla z23.s, z7.s, z1.s[1]\n" - "fmla z26.s, z7.s, z1.s[2]\n" - "fmla z29.s, z7.s, z1.s[3]\n" - "fmla z9.s, z4.s, z0.s[0]\n" - "fmla z12.s, z4.s, z0.s[1]\n" - "fmla z15.s, z4.s, z0.s[2]\n" - "fmla z18.s, z4.s, z0.s[3]\n" - "fmla z21.s, z4.s, z1.s[0]\n" - "fmla z24.s, z4.s, z1.s[1]\n" - "fmla z27.s, z4.s, z1.s[2]\n" - "fmla z30.s, z4.s, z1.s[3]\n" - "fmla z10.s, z5.s, z0.s[0]\n" - "fmla z13.s, z5.s, z0.s[1]\n" - "fmla z16.s, z5.s, z0.s[2]\n" - "fmla z19.s, z5.s, z0.s[3]\n" - "fmla z22.s, z5.s, z1.s[0]\n" - "fmla z25.s, z5.s, z1.s[1]\n" - "fmla z28.s, z5.s, z1.s[2]\n" - "fmla z31.s, z5.s, z1.s[3]\n" + "fmla z23.s, z2.s, z3.s[1]\n" + "fmla z26.s, z2.s, z3.s[2]\n" + "fmla z29.s, z2.s, z3.s[3]\n" + "fmla z9.s, z1.s, z4.s[0]\n" + "fmla z12.s, z1.s, z4.s[1]\n" + "fmla z15.s, z1.s, z4.s[2]\n" + "fmla z18.s, z1.s, z4.s[3]\n" + "fmla z21.s, z1.s, z3.s[0]\n" + "fmla z24.s, z1.s, z3.s[1]\n" + "fmla z27.s, z1.s, z3.s[2]\n" + "fmla z30.s, z1.s, z3.s[3]\n" + "fmla z10.s, z0.s, z4.s[0]\n" + "fmla z13.s, z0.s, z4.s[1]\n" + "fmla z16.s, z0.s, z4.s[2]\n" + "fmla z19.s, z0.s, z4.s[3]\n" + "fmla z22.s, z0.s, z3.s[0]\n" + "fmla z25.s, z0.s, z3.s[1]\n" + "fmla z28.s, z0.s, z3.s[2]\n" + "fmla z31.s, z0.s, z3.s[3]\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp index 0d707b0391..cf3069f828 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -35,6 +35,7 @@ namespace arm_gemm { // Actual kernel implementations void sve_interleaved_s8s32_dot_8x3VL( ARGLIST ); +void sve_interleaved_s8s32_dot_8x3VL_a64fx( ARGLIST ); class cls_sve_interleaved_s8s32_dot_8x3VL { @@ -55,11 +56,6 @@ public: return get_vector_length() * 3; } - static unsigned int stripe_width() - { - return get_vector_length(); - } - static constexpr unsigned int k_unroll() { return 4; @@ -80,6 +76,8 @@ public: return { 63.30, 4.97, 11.35 }; case CPUModel::A510: return { 27.42, 3.47, 2.88 }; + case CPUModel::A64FX: + return { 109.18, 3.88, 7.85 }; } } @@ -92,6 +90,8 @@ public: return { 52.24, 7.49, 0.80 }; case CPUModel::A510: return { 27.47, 1.70, 0.28 }; + case CPUModel::A64FX: + return { 109.92, 2.36, 0.41 }; } } @@ -100,13 +100,19 @@ public: // Default to the generic kernel kern_type kernel=sve_interleaved_s8s32_dot_8x3VL; - cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *) + cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_interleaved_s8s32_dot_8x3VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp index a7ca48d87a..c668a7b746 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_s8s32_dot_8x3VL_a64fx( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *Apanel, + const int8_t *Bpanel, + int32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -89,7 +93,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( "sdot z9.s, z1.b, z3.b\n" "sub x20, x20, #0x2\n" "sdot z10.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "sdot z11.s, z0.b, z4.b\n" "sdot z12.s, z1.b, z4.b\n" "sdot z13.s, z2.b, z4.b\n" @@ -98,63 +102,63 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( "sdot z15.s, z1.b, z5.b\n" "cmp x20, #0x2\n" "sdot z16.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n" "sdot z17.s, z0.b, z6.b\n" "sdot z18.s, z1.b, z6.b\n" "sdot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "sdot z20.s, z0.b, z3.b\n" - "sdot z21.s, z1.b, z3.b\n" - "sdot z22.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n" + "sdot z20.s, z0.b, z7.b\n" + "sdot z21.s, z1.b, z7.b\n" + "sdot z22.s, z2.b, z7.b\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n" "sdot z23.s, z0.b, z4.b\n" "sdot z24.s, z1.b, z4.b\n" "sdot z25.s, z2.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" - "sdot z26.s, z0.b, z5.b\n" - "sdot z27.s, z1.b, z5.b\n" - "sdot z28.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" - "sdot z29.s, z0.b, z6.b\n" - "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n" - "sdot z30.s, z1.b, z6.b\n" - "sdot z31.s, z2.b, z6.b\n" - "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n" - "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n" - "sdot z8.s, z0.b, z3.b\n" - "ld1rw { z6.s }, p0/Z, 
[%x[Apanel], #44]\n" - "sdot z9.s, z1.b, z3.b\n" - "sdot z10.s, z2.b, z3.b\n" - "sdot z11.s, z0.b, z4.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" - "sdot z12.s, z1.b, z4.b\n" - "sdot z13.s, z2.b, z4.b\n" + "sdot z26.s, z0.b, z3.b\n" + "sdot z27.s, z1.b, z3.b\n" + "sdot z28.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n" + "sdot z29.s, z0.b, z5.b\n" + "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n" + "sdot z30.s, z1.b, z5.b\n" + "sdot z31.s, z2.b, z5.b\n" + "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n" + "sdot z8.s, z6.b, z7.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n" + "sdot z9.s, z2.b, z7.b\n" + "sdot z10.s, z5.b, z7.b\n" + "sdot z11.s, z6.b, z4.b\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n" + "sdot z12.s, z2.b, z4.b\n" + "sdot z13.s, z5.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" - "sdot z14.s, z0.b, z5.b\n" - "sdot z15.s, z1.b, z5.b\n" + "sdot z14.s, z6.b, z3.b\n" + "sdot z15.s, z2.b, z3.b\n" "addvl x22, x22, #6\n" - "sdot z16.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" - "sdot z17.s, z0.b, z6.b\n" - "sdot z18.s, z1.b, z6.b\n" - "sdot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "sdot z16.s, z5.b, z3.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n" + "sdot z17.s, z6.b, z1.b\n" + "sdot z18.s, z2.b, z1.b\n" + "sdot z19.s, z5.b, z1.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n" "add %x[Apanel], %x[Apanel], #0x40\n" - "sdot z20.s, z0.b, z3.b\n" - "sdot z21.s, z1.b, z3.b\n" - "sdot z22.s, z2.b, z3.b\n" - "sdot z23.s, z0.b, z4.b\n" + "sdot z20.s, z6.b, z7.b\n" + "sdot z21.s, z2.b, z7.b\n" + "sdot z22.s, z5.b, z7.b\n" + "sdot z23.s, z6.b, z4.b\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "sdot z24.s, z1.b, z4.b\n" - "sdot z25.s, z2.b, z4.b\n" + "sdot z24.s, z2.b, z4.b\n" + "sdot z25.s, z5.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "sdot z26.s, z0.b, z5.b\n" - "sdot z27.s, z1.b, z5.b\n" - "sdot z28.s, z2.b, z5.b\n" - "sdot z29.s, z0.b, z6.b\n" + "sdot z26.s, z6.b, z0.b\n" + "sdot z27.s, z2.b, z0.b\n" + "sdot z28.s, z5.b, z0.b\n" + "sdot z29.s, z6.b, z1.b\n" "ld1b { z0.b }, p0/Z, [x22]\n" - "sdot z30.s, z1.b, z6.b\n" - "sdot z31.s, z2.b, z6.b\n" + "sdot z30.s, z2.b, z1.b\n" + "sdot z31.s, z5.b, z1.b\n" "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n" "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" @@ -165,7 +169,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( "sdot z9.s, z1.b, z3.b\n" "addvl x22, x22, #3\n" "sdot z10.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "sdot z11.s, z0.b, z4.b\n" "sdot z12.s, z1.b, z4.b\n" "sdot z13.s, z2.b, z4.b\n" @@ -177,58 +181,58 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( "sdot z17.s, z0.b, z6.b\n" "sdot z18.s, z1.b, z6.b\n" "sdot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "sdot z20.s, z0.b, z3.b\n" - "sdot z21.s, z1.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n" + "sdot z20.s, z0.b, z7.b\n" + "sdot z21.s, z1.b, z7.b\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "sdot z22.s, z2.b, z3.b\n" + "sdot z22.s, z2.b, z7.b\n" "sdot z23.s, z0.b, z4.b\n" "sdot z24.s, z1.b, z4.b\n" "sdot z25.s, z2.b, z4.b\n" "sdot z26.s, z0.b, z5.b\n" "sdot z27.s, z1.b, z5.b\n" "sdot z28.s, z2.b, z5.b\n" - "sdot z29.s, z0.b, z6.b\n" - "sdot z30.s, z1.b, z6.b\n" - "sdot z31.s, z2.b, z6.b\n" + "sdot z29.s, z0.b, z3.b\n" + "sdot z30.s, z1.b, z3.b\n" + "sdot z31.s, z2.b, z3.b\n" "cbz x20, 5f\n" - "ld1b { z0.b }, p0/Z, [x22]\n" 
- "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n" - "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z6.b }, p0/Z, [x22]\n" + "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "sdot z8.s, z0.b, z3.b\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" - "sdot z9.s, z1.b, z3.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" - "sdot z10.s, z2.b, z3.b\n" - "sdot z11.s, z0.b, z4.b\n" - "sdot z12.s, z1.b, z4.b\n" - "sdot z13.s, z2.b, z4.b\n" + "sdot z8.s, z6.b, z3.b\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n" + "sdot z9.s, z5.b, z3.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n" + "sdot z10.s, z4.b, z3.b\n" + "sdot z11.s, z6.b, z2.b\n" + "sdot z12.s, z5.b, z2.b\n" + "sdot z13.s, z4.b, z2.b\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" - "sdot z14.s, z0.b, z5.b\n" - "sdot z15.s, z1.b, z5.b\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" - "sdot z16.s, z2.b, z5.b\n" - "sdot z17.s, z0.b, z6.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" - "sdot z18.s, z1.b, z6.b\n" - "sdot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "sdot z20.s, z0.b, z3.b\n" - "sdot z21.s, z1.b, z3.b\n" + "sdot z14.s, z6.b, z1.b\n" + "sdot z15.s, z5.b, z1.b\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n" + "sdot z16.s, z4.b, z1.b\n" + "sdot z17.s, z6.b, z0.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n" + "sdot z18.s, z5.b, z0.b\n" + "sdot z19.s, z4.b, z0.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n" + "sdot z20.s, z6.b, z3.b\n" + "sdot z21.s, z5.b, z3.b\n" "addvl x22, x22, #3\n" - "sdot z22.s, z2.b, z3.b\n" - "sdot z23.s, z0.b, z4.b\n" + "sdot z22.s, z4.b, z3.b\n" + "sdot z23.s, z6.b, z2.b\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "sdot z24.s, z1.b, z4.b\n" - "sdot z25.s, z2.b, z4.b\n" - "sdot z26.s, z0.b, z5.b\n" - "sdot z27.s, z1.b, z5.b\n" - "sdot z28.s, z2.b, z5.b\n" - "sdot z29.s, z0.b, z6.b\n" - "sdot z30.s, z1.b, z6.b\n" - "sdot z31.s, z2.b, z6.b\n" + "sdot z24.s, z5.b, z2.b\n" + "sdot z25.s, z4.b, z2.b\n" + "sdot z26.s, z6.b, z1.b\n" + "sdot z27.s, z5.b, z1.b\n" + "sdot z28.s, z4.b, z1.b\n" + "sdot z29.s, z6.b, z0.b\n" + "sdot z30.s, z5.b, z0.b\n" + "sdot z31.s, z4.b, z0.b\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -262,7 +266,7 @@ void sve_interleaved_s8s32_dot_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp index e5f59d220b..f6e1a75c15 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp +++ 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_s8s32_dot_8x3VL( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *Apanel, + const int8_t *Bpanel, + int32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,10 +89,10 @@ void sve_interleaved_s8s32_dot_8x3VL( "3:" // main loop head "sdot z8.s, z4.b, z0.b[0]\n" "sdot z11.s, z4.b, z0.b[1]\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n" "sdot z14.s, z4.b, z0.b[2]\n" "sdot z17.s, z4.b, z0.b[3]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n" "sdot z20.s, z4.b, z1.b[0]\n" "sdot z23.s, z4.b, z1.b[1]\n" "sub x20, x20, #0x2\n" @@ -115,35 +119,35 @@ void sve_interleaved_s8s32_dot_8x3VL( "sdot z25.s, z6.b, z1.b[1]\n" "sdot z28.s, z6.b, z1.b[2]\n" "sdot z31.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n" "addvl x22, x22, #6\n" - "sdot z8.s, z4.b, z2.b[0]\n" - "sdot z11.s, z4.b, z2.b[1]\n" + "sdot z8.s, z4.b, z3.b[0]\n" + "sdot z11.s, z4.b, z3.b[1]\n" "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" - "sdot z14.s, z4.b, z2.b[2]\n" - "sdot z17.s, z4.b, z2.b[3]\n" - "sdot z20.s, z4.b, z3.b[0]\n" - "sdot z23.s, z4.b, z3.b[1]\n" - "sdot z26.s, z4.b, z3.b[2]\n" - "sdot z29.s, z4.b, z3.b[3]\n" + "sdot z14.s, z4.b, z3.b[2]\n" + "sdot z17.s, z4.b, z3.b[3]\n" + "sdot z20.s, z4.b, z7.b[0]\n" + "sdot z23.s, z4.b, z7.b[1]\n" + "sdot z26.s, z4.b, z7.b[2]\n" + "sdot z29.s, z4.b, z7.b[3]\n" "ld1b { z4.b }, p0/Z, [x22]\n" - "sdot z9.s, z5.b, z2.b[0]\n" - "sdot z12.s, z5.b, z2.b[1]\n" - "sdot z15.s, z5.b, z2.b[2]\n" - "sdot z18.s, z5.b, z2.b[3]\n" - "sdot z21.s, z5.b, z3.b[0]\n" - "sdot z24.s, z5.b, z3.b[1]\n" - "sdot z27.s, z5.b, z3.b[2]\n" - "sdot z30.s, z5.b, z3.b[3]\n" + "sdot z9.s, z5.b, z3.b[0]\n" + "sdot z12.s, z5.b, z3.b[1]\n" + "sdot z15.s, z5.b, z3.b[2]\n" + "sdot z18.s, z5.b, z3.b[3]\n" + "sdot z21.s, z5.b, z7.b[0]\n" + "sdot z24.s, z5.b, z7.b[1]\n" + "sdot z27.s, z5.b, z7.b[2]\n" + "sdot z30.s, z5.b, z7.b[3]\n" "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n" - "sdot z10.s, z6.b, z2.b[0]\n" - "sdot z13.s, z6.b, z2.b[1]\n" - "sdot z16.s, z6.b, z2.b[2]\n" - "sdot z19.s, z6.b, z2.b[3]\n" - "sdot z22.s, z6.b, z3.b[0]\n" - "sdot z25.s, z6.b, z3.b[1]\n" - "sdot z28.s, z6.b, z3.b[2]\n" - "sdot z31.s, z6.b, z3.b[3]\n" + "sdot z10.s, z2.b, z3.b[0]\n" + "sdot z13.s, z2.b, z3.b[1]\n" + "sdot z16.s, z2.b, z3.b[2]\n" + "sdot z19.s, z2.b, z3.b[3]\n" + "sdot z22.s, z2.b, z7.b[0]\n" + "sdot z25.s, z2.b, z7.b[1]\n" + "sdot z28.s, z2.b, z7.b[2]\n" + "sdot z31.s, z2.b, z7.b[3]\n" "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n" "bge 3b\n" "4:" // main loop skip @@ -174,37 +178,37 @@ void sve_interleaved_s8s32_dot_8x3VL( "sdot z28.s, z6.b, z1.b[2]\n" "sdot z31.s, z6.b, z1.b[3]\n" "cbz x20, 5f\n" - "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n" - "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n" + "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1b { z7.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n" - "sdot z8.s, z7.b, z0.b[0]\n" - "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" - "sdot z11.s, z7.b, z0.b[1]\n" - "sdot z14.s, z7.b, z0.b[2]\n" - "sdot z17.s, z7.b, z0.b[3]\n" - "sdot z20.s, z7.b, z1.b[0]\n" + "ld1b { z2.b }, p0/Z, [x22]\n" + "ld1b { z1.b }, 
p0/Z, [x22, #1, MUL VL]\n" + "sdot z8.s, z2.b, z4.b[0]\n" + "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n" + "sdot z11.s, z2.b, z4.b[1]\n" + "sdot z14.s, z2.b, z4.b[2]\n" + "sdot z17.s, z2.b, z4.b[3]\n" + "sdot z20.s, z2.b, z3.b[0]\n" "addvl x22, x22, #3\n" - "sdot z23.s, z7.b, z1.b[1]\n" - "sdot z26.s, z7.b, z1.b[2]\n" - "sdot z29.s, z7.b, z1.b[3]\n" - "sdot z9.s, z4.b, z0.b[0]\n" - "sdot z12.s, z4.b, z0.b[1]\n" - "sdot z15.s, z4.b, z0.b[2]\n" - "sdot z18.s, z4.b, z0.b[3]\n" - "sdot z21.s, z4.b, z1.b[0]\n" - "sdot z24.s, z4.b, z1.b[1]\n" - "sdot z27.s, z4.b, z1.b[2]\n" - "sdot z30.s, z4.b, z1.b[3]\n" - "sdot z10.s, z5.b, z0.b[0]\n" - "sdot z13.s, z5.b, z0.b[1]\n" - "sdot z16.s, z5.b, z0.b[2]\n" - "sdot z19.s, z5.b, z0.b[3]\n" - "sdot z22.s, z5.b, z1.b[0]\n" - "sdot z25.s, z5.b, z1.b[1]\n" - "sdot z28.s, z5.b, z1.b[2]\n" - "sdot z31.s, z5.b, z1.b[3]\n" + "sdot z23.s, z2.b, z3.b[1]\n" + "sdot z26.s, z2.b, z3.b[2]\n" + "sdot z29.s, z2.b, z3.b[3]\n" + "sdot z9.s, z1.b, z4.b[0]\n" + "sdot z12.s, z1.b, z4.b[1]\n" + "sdot z15.s, z1.b, z4.b[2]\n" + "sdot z18.s, z1.b, z4.b[3]\n" + "sdot z21.s, z1.b, z3.b[0]\n" + "sdot z24.s, z1.b, z3.b[1]\n" + "sdot z27.s, z1.b, z3.b[2]\n" + "sdot z30.s, z1.b, z3.b[3]\n" + "sdot z10.s, z0.b, z4.b[0]\n" + "sdot z13.s, z0.b, z4.b[1]\n" + "sdot z16.s, z0.b, z4.b[2]\n" + "sdot z19.s, z0.b, z4.b[3]\n" + "sdot z22.s, z0.b, z3.b[0]\n" + "sdot z25.s, z0.b, z3.b[1]\n" + "sdot z28.s, z0.b, z3.b[2]\n" + "sdot z31.s, z0.b, z3.b[3]\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp index 4e65296f8b..82734abfbe 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -55,11 +55,6 @@ public: return get_vector_length<int32_t>() * 3; } - static unsigned int stripe_width() - { - return get_vector_length<int32_t>(); - } - static constexpr unsigned int k_unroll() { return 8; } @@ -89,7 +84,7 @@ public: default: return { 61.97, 3.64, 0.50 }; case CPUModel::V1: - return { 95.28, 7.99, 0.79 }; + return { 95.28, 7.99, 0.79 }; case CPUModel::A510: return { 43.36, 1.86, 0.28 }; } @@ -108,5 +103,4 @@ } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp index 104d5f918e..bfed5000fc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_s8s32_mmla_8x3VL( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *Apanel, + const int8_t *Bpanel, + int32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,82 +89,82 @@ void sve_interleaved_s8s32_mmla_8x3VL( "mov z31.s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n" ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n" ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n" ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n" ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n" - "ld1b { z6.b }, p0/Z, [x22]\n" + "ld1b { z7.b }, p0/Z, [x22]\n" ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n" ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" - ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n" - ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n" - ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x450498da // smmla z26.s, z6.b, z4.b\n" + ".inst 0x450598dd // smmla z29.s, z6.b, z5.b\n" + "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n" + ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n" + ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n" + ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n" "sub x20, x20, #0x2\n" - ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n" + ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n" + ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n" "cmp x20, #0x2\n" - ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n" - ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n" - ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n" - ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n" + ".inst 0x450798db // smmla z27.s, z6.b, z7.b\n" + ".inst 0x450398de // smmla z30.s, z6.b, z3.b\n" + "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n" + ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n" + ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n" "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n" - ".inst 
0x45049830 // smmla z16.s, z1.b, z4.b\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" + ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n" + ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n" "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n" - ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n" - ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n" + ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n" + ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n" "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n" - ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n" - "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n" - ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n" - "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n" + ".inst 0x450598dc // smmla z28.s, z6.b, z5.b\n" + ".inst 0x450498df // smmla z31.s, z6.b, z4.b\n" + "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n" + "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n" + ".inst 0x45039808 // smmla z8.s, z0.b, z3.b\n" + "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n" "addvl x22, x22, #16\n" ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" + ".inst 0x4503982e // smmla z14.s, z1.b, z3.b\n" ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n" - ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n" - ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n" + ".inst 0x450398b4 // smmla z20.s, z5.b, z3.b\n" + ".inst 0x450798b7 // smmla z23.s, z5.b, z7.b\n" + ".inst 0x450398da // smmla z26.s, z6.b, z3.b\n" + ".inst 0x450798dd // smmla z29.s, z6.b, z7.b\n" + "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n" "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n" - ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n" - ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n" - ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n" - ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n" - ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n" - ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n" - ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n" + ".inst 0x45029809 // smmla z9.s, z0.b, z2.b\n" + ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n" + ".inst 0x4502982f // smmla z15.s, z1.b, z2.b\n" + ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n" + ".inst 0x450298b5 // smmla z21.s, z5.b, z2.b\n" + ".inst 0x450498b8 // smmla z24.s, z5.b, z4.b\n" + ".inst 0x450298db // smmla z27.s, z6.b, z2.b\n" + ".inst 0x450498de // smmla z30.s, z6.b, z4.b\n" "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n" - ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n" + ".inst 0x4503980a // smmla z10.s, z0.b, z3.b\n" ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n" "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" + ".inst 0x45039830 // smmla z16.s, z1.b, z3.b\n" ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n" "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" + ".inst 0x450398b6 // smmla z22.s, z5.b, z3.b\n" + ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n" "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n" - ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n" - ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n" + ".inst 0x450398dc // smmla z28.s, z6.b, z3.b\n" + ".inst 0x450798df // smmla 
z31.s, z6.b, z7.b\n" "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n" "add %x[Apanel], %x[Apanel], #0x80\n" "addvl x22, x22, #-4\n" "bge 3b\n" "4:" // main loop skip - "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n" + "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n" ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n" ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n" ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n" @@ -168,114 +172,114 @@ void sve_interleaved_s8s32_mmla_8x3VL( "ld1b { z6.b }, p0/Z, [x22]\n" ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n" ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" - ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" + "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x450498fa // smmla z26.s, z7.b, z4.b\n" + ".inst 0x450598fd // smmla z29.s, z7.b, z5.b\n" + "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n" ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n" - ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n" + ".inst 0x4503980c // smmla z12.s, z0.b, z3.b\n" ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n" - ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n" + ".inst 0x45039832 // smmla z18.s, z1.b, z3.b\n" "add %x[Apanel], %x[Apanel], #0x10\n" ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n" - ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n" + ".inst 0x45039858 // smmla z24.s, z2.b, z3.b\n" "addvl x22, x22, #4\n" - ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n" - ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n" - ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n" - ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n" - ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n" - ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n" - ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n" - ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n" - ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n" - ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n" + ".inst 0x450698fb // smmla z27.s, z7.b, z6.b\n" + ".inst 0x450398fe // smmla z30.s, z7.b, z3.b\n" + ".inst 0x4505980a // smmla z10.s, z0.b, z5.b\n" + ".inst 0x4504980d // smmla z13.s, z0.b, z4.b\n" + ".inst 0x45059830 // smmla z16.s, z1.b, z5.b\n" + ".inst 0x45049833 // smmla z19.s, z1.b, z4.b\n" + ".inst 0x45059856 // smmla z22.s, z2.b, z5.b\n" + ".inst 0x45049859 // smmla z25.s, z2.b, z4.b\n" + ".inst 0x450598fc // smmla z28.s, z7.b, z5.b\n" + ".inst 0x450498ff // smmla z31.s, z7.b, z4.b\n" "cbz x20, 5f\n" - "ld1b { z6.b }, p0/Z, [x22]\n" - "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n" - ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n" - "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n" - ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n" - ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n" - ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n" - ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n" - "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n" - ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n" + "ld1b { z1.b }, p0/Z, [x22]\n" + "ld1rqb { 
z7.b }, p0/Z, [%x[Apanel]]\n" + ".inst 0x450198e8 // smmla z8.s, z7.b, z1.b\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n" + "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x450098eb // smmla z11.s, z7.b, z0.b\n" + "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n" + ".inst 0x450198ce // smmla z14.s, z6.b, z1.b\n" + ".inst 0x450098d1 // smmla z17.s, z6.b, z0.b\n" + ".inst 0x450198b4 // smmla z20.s, z5.b, z1.b\n" + "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x450098b7 // smmla z23.s, z5.b, z0.b\n" + ".inst 0x4501989a // smmla z26.s, z4.b, z1.b\n" + "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x4500989d // smmla z29.s, z4.b, z0.b\n" + "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n" + "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n" + ".inst 0x450398e9 // smmla z9.s, z7.b, z3.b\n" + ".inst 0x450298ec // smmla z12.s, z7.b, z2.b\n" "addvl x22, x22, #6\n" - ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n" - ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n" + ".inst 0x450398cf // smmla z15.s, z6.b, z3.b\n" + ".inst 0x450298d2 // smmla z18.s, z6.b, z2.b\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n" - ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n" - ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n" - ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n" - ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n" - ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n" - ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n" - ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n" - ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n" - ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" - ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n" - ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n" + ".inst 0x450398b5 // smmla z21.s, z5.b, z3.b\n" + ".inst 0x450298b8 // smmla z24.s, z5.b, z2.b\n" + ".inst 0x4503989b // smmla z27.s, z4.b, z3.b\n" + ".inst 0x4502989e // smmla z30.s, z4.b, z2.b\n" + ".inst 0x450198ea // smmla z10.s, z7.b, z1.b\n" + ".inst 0x450098ed // smmla z13.s, z7.b, z0.b\n" + ".inst 0x450198d0 // smmla z16.s, z6.b, z1.b\n" + ".inst 0x450098d3 // smmla z19.s, z6.b, z0.b\n" + ".inst 0x450198b6 // smmla z22.s, z5.b, z1.b\n" + ".inst 0x450098b9 // smmla z25.s, z5.b, z0.b\n" + ".inst 0x4501989c // smmla z28.s, z4.b, z1.b\n" + ".inst 0x4500989f // smmla z31.s, z4.b, z0.b\n" "5:" // multiply loop done - "uzp1 z4.d, z8.d, z11.d\n" + "uzp1 z0.d, z8.d, z11.d\n" "uzp2 z8.d, z8.d, z11.d\n" - "st1w { z4.s }, p0, [%x[Cpanel]]\n" - "uzp1 z11.d, z9.d, z12.d\n" + "st1w { z0.s }, p0, [%x[Cpanel]]\n" + "uzp1 z0.d, z9.d, z12.d\n" "uzp2 z9.d, z9.d, z12.d\n" - "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "uzp1 z12.d, z10.d, z13.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "uzp1 z0.d, z10.d, z13.d\n" "uzp2 z10.d, z10.d, z13.d\n" - "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "uzp1 z13.d, z14.d, z17.d\n" + "uzp1 z0.d, z14.d, z17.d\n" "uzp2 z14.d, z14.d, z17.d\n" "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n" - "uzp1 z17.d, z15.d, z18.d\n" + "uzp1 z1.d, z15.d, z18.d\n" "subs x23, x23, #0x1\n" "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "uzp2 z15.d, z15.d, z18.d\n" - "uzp1 z18.d, z16.d, z19.d\n" - "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "uzp1 z17.d, z16.d, z19.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "uzp2 z16.d, z16.d, z19.d\n" - "uzp1 z19.d, z20.d, z23.d\n" - "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "uzp1 
z0.d, z20.d, z23.d\n" + "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n" "addvl %x[Cpanel], %x[Cpanel], #16\n" "uzp2 z20.d, z20.d, z23.d\n" - "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" "uzp1 z23.d, z21.d, z24.d\n" "uzp2 z21.d, z21.d, z24.d\n" "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" - "uzp1 z24.d, z22.d, z25.d\n" + "uzp1 z19.d, z22.d, z25.d\n" "uzp2 z22.d, z22.d, z25.d\n" "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" - "uzp1 z25.d, z26.d, z29.d\n" + "uzp1 z18.d, z26.d, z29.d\n" "uzp2 z26.d, z26.d, z29.d\n" "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" - "uzp1 z29.d, z27.d, z30.d\n" + "uzp1 z17.d, z27.d, z30.d\n" "uzp2 z27.d, z27.d, z30.d\n" - "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" - "uzp1 z30.d, z28.d, z31.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "uzp1 z16.d, z28.d, z31.d\n" "uzp2 z28.d, z28.d, z31.d\n" "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" - "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" "st1w { z21.s }, p0, [%x[Cpanel]]\n" "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n" - "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n" "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n" @@ -290,4 +294,4 @@ void sve_interleaved_s8s32_mmla_8x3VL( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp index 0afcdd2ce4..c0b215ccb4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -35,6 +35,7 @@ namespace arm_gemm { // Actual kernel implementations void sve_interleaved_u8u32_dot_8x3VL( ARGLIST ); +void sve_interleaved_u8u32_dot_8x3VL_a64fx( ARGLIST ); class cls_sve_interleaved_u8u32_dot_8x3VL { @@ -55,11 +56,6 @@ public: return get_vector_length<uint32_t>() * 3; } - static unsigned int stripe_width() - { - return get_vector_length<uint32_t>(); - } - static constexpr unsigned int k_unroll() { return 4; } @@ -80,6 +76,8 @@ return { 27.44, 3.41, 2.90 }; case CPUModel::V1: return { 63.30, 4.97, 11.52 }; + case CPUModel::A64FX: + return { 109.76, 3.88, 6.76 }; } } @@ -92,6 +90,8 @@ return { 27.45, 1.65, 0.28 }; case CPUModel::V1: return { 52.24, 7.49, 0.80 }; + case CPUModel::A64FX: + return { 110.18, 2.34, 0.40 }; } } @@ -100,13 +100,19 @@ // Default to the generic kernel kern_type kernel=sve_interleaved_u8u32_dot_8x3VL; - cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *) + cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *ci) { + switch(ci->get_cpu_model()) { + default: + break; + case CPUModel::A64FX: + kernel=sve_interleaved_u8u32_dot_8x3VL_a64fx; + break; + } } }; } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp index 2bfec8f350..79e794a834 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_u8u32_dot_8x3VL_a64fx( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *Apanel, + const uint8_t *Bpanel, + uint32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -89,7 +93,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( "udot z9.s, z1.b, z3.b\n" "sub x20, x20, #0x2\n" "udot z10.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "udot z11.s, z0.b, z4.b\n" "udot z12.s, z1.b, z4.b\n" "udot z13.s, z2.b, z4.b\n" @@ -98,63 +102,63 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( "udot z15.s, z1.b, z5.b\n" "cmp x20, #0x2\n" "udot z16.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #24]\n" "udot z17.s, z0.b, z6.b\n" "udot z18.s, z1.b, z6.b\n" "udot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "udot z20.s, z0.b, z3.b\n" - "udot z21.s, z1.b, z3.b\n" - "udot z22.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n" + "ld1rw { z5.s }, p0/Z, [%x[Apanel], #28]\n" + "udot z20.s, z0.b, z7.b\n" + "udot z21.s, z1.b, z7.b\n" + "udot z22.s, z2.b, z7.b\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #32]\n" "udot z23.s, z0.b, z4.b\n" "udot z24.s, z1.b, z4.b\n" "udot z25.s, z2.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n" - "udot z26.s, z0.b, z5.b\n" - "udot z27.s, z1.b, z5.b\n" - "udot z28.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n" - "udot z29.s, z0.b, z6.b\n" - "ld1b { z0.b }, p0/Z, [x22, #3, MUL VL]\n" - "udot z30.s, z1.b, z6.b\n" - "udot z31.s, z2.b, z6.b\n" - "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n" - "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n" - "udot z8.s, z0.b, z3.b\n" - "ld1rw { z6.s }, p0/Z, 
[%x[Apanel], #44]\n" - "udot z9.s, z1.b, z3.b\n" - "udot z10.s, z2.b, z3.b\n" - "udot z11.s, z0.b, z4.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n" - "udot z12.s, z1.b, z4.b\n" - "udot z13.s, z2.b, z4.b\n" + "udot z26.s, z0.b, z3.b\n" + "udot z27.s, z1.b, z3.b\n" + "udot z28.s, z2.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #40]\n" + "udot z29.s, z0.b, z5.b\n" + "ld1b { z6.b }, p0/Z, [x22, #3, MUL VL]\n" + "udot z30.s, z1.b, z5.b\n" + "udot z31.s, z2.b, z5.b\n" + "ld1b { z2.b }, p0/Z, [x22, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x22, #5, MUL VL]\n" + "udot z8.s, z6.b, z7.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #44]\n" + "udot z9.s, z2.b, z7.b\n" + "udot z10.s, z5.b, z7.b\n" + "udot z11.s, z6.b, z4.b\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #48]\n" + "udot z12.s, z2.b, z4.b\n" + "udot z13.s, z5.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n" - "udot z14.s, z0.b, z5.b\n" - "udot z15.s, z1.b, z5.b\n" + "udot z14.s, z6.b, z3.b\n" + "udot z15.s, z2.b, z3.b\n" "addvl x22, x22, #6\n" - "udot z16.s, z2.b, z5.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n" - "udot z17.s, z0.b, z6.b\n" - "udot z18.s, z1.b, z6.b\n" - "udot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n" + "udot z16.s, z5.b, z3.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #56]\n" + "udot z17.s, z6.b, z1.b\n" + "udot z18.s, z2.b, z1.b\n" + "udot z19.s, z5.b, z1.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #60]\n" "add %x[Apanel], %x[Apanel], #0x40\n" - "udot z20.s, z0.b, z3.b\n" - "udot z21.s, z1.b, z3.b\n" - "udot z22.s, z2.b, z3.b\n" - "udot z23.s, z0.b, z4.b\n" + "udot z20.s, z6.b, z7.b\n" + "udot z21.s, z2.b, z7.b\n" + "udot z22.s, z5.b, z7.b\n" + "udot z23.s, z6.b, z4.b\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "udot z24.s, z1.b, z4.b\n" - "udot z25.s, z2.b, z4.b\n" + "udot z24.s, z2.b, z4.b\n" + "udot z25.s, z5.b, z4.b\n" "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "udot z26.s, z0.b, z5.b\n" - "udot z27.s, z1.b, z5.b\n" - "udot z28.s, z2.b, z5.b\n" - "udot z29.s, z0.b, z6.b\n" + "udot z26.s, z6.b, z0.b\n" + "udot z27.s, z2.b, z0.b\n" + "udot z28.s, z5.b, z0.b\n" + "udot z29.s, z6.b, z1.b\n" "ld1b { z0.b }, p0/Z, [x22]\n" - "udot z30.s, z1.b, z6.b\n" - "udot z31.s, z2.b, z6.b\n" + "udot z30.s, z2.b, z1.b\n" + "udot z31.s, z5.b, z1.b\n" "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n" "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" @@ -165,7 +169,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( "udot z9.s, z1.b, z3.b\n" "addvl x22, x22, #3\n" "udot z10.s, z2.b, z3.b\n" - "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" + "ld1rw { z7.s }, p0/Z, [%x[Apanel], #16]\n" "udot z11.s, z0.b, z4.b\n" "udot z12.s, z1.b, z4.b\n" "udot z13.s, z2.b, z4.b\n" @@ -177,58 +181,58 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( "udot z17.s, z0.b, z6.b\n" "udot z18.s, z1.b, z6.b\n" "udot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "udot z20.s, z0.b, z3.b\n" - "udot z21.s, z1.b, z3.b\n" + "ld1rw { z3.s }, p0/Z, [%x[Apanel], #28]\n" + "udot z20.s, z0.b, z7.b\n" + "udot z21.s, z1.b, z7.b\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "udot z22.s, z2.b, z3.b\n" + "udot z22.s, z2.b, z7.b\n" "udot z23.s, z0.b, z4.b\n" "udot z24.s, z1.b, z4.b\n" "udot z25.s, z2.b, z4.b\n" "udot z26.s, z0.b, z5.b\n" "udot z27.s, z1.b, z5.b\n" "udot z28.s, z2.b, z5.b\n" - "udot z29.s, z0.b, z6.b\n" - "udot z30.s, z1.b, z6.b\n" - "udot z31.s, z2.b, z6.b\n" + "udot z29.s, z0.b, z3.b\n" + "udot z30.s, z1.b, z3.b\n" + "udot z31.s, z2.b, z3.b\n" "cbz x20, 5f\n" - "ld1b { z0.b }, p0/Z, [x22]\n" 
- "ld1b { z1.b }, p0/Z, [x22, #1, MUL VL]\n" - "ld1b { z2.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z6.b }, p0/Z, [x22]\n" + "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n" - "udot z8.s, z0.b, z3.b\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n" - "udot z9.s, z1.b, z3.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n" - "udot z10.s, z2.b, z3.b\n" - "udot z11.s, z0.b, z4.b\n" - "udot z12.s, z1.b, z4.b\n" - "udot z13.s, z2.b, z4.b\n" + "udot z8.s, z6.b, z3.b\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #4]\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #8]\n" + "udot z9.s, z5.b, z3.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #12]\n" + "udot z10.s, z4.b, z3.b\n" + "udot z11.s, z6.b, z2.b\n" + "udot z12.s, z5.b, z2.b\n" + "udot z13.s, z4.b, z2.b\n" "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n" - "udot z14.s, z0.b, z5.b\n" - "udot z15.s, z1.b, z5.b\n" - "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n" - "udot z16.s, z2.b, z5.b\n" - "udot z17.s, z0.b, z6.b\n" - "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n" - "udot z18.s, z1.b, z6.b\n" - "udot z19.s, z2.b, z6.b\n" - "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n" - "udot z20.s, z0.b, z3.b\n" - "udot z21.s, z1.b, z3.b\n" + "udot z14.s, z6.b, z1.b\n" + "udot z15.s, z5.b, z1.b\n" + "ld1rw { z2.s }, p0/Z, [%x[Apanel], #20]\n" + "udot z16.s, z4.b, z1.b\n" + "udot z17.s, z6.b, z0.b\n" + "ld1rw { z1.s }, p0/Z, [%x[Apanel], #24]\n" + "udot z18.s, z5.b, z0.b\n" + "udot z19.s, z4.b, z0.b\n" + "ld1rw { z0.s }, p0/Z, [%x[Apanel], #28]\n" + "udot z20.s, z6.b, z3.b\n" + "udot z21.s, z5.b, z3.b\n" "addvl x22, x22, #3\n" - "udot z22.s, z2.b, z3.b\n" - "udot z23.s, z0.b, z4.b\n" + "udot z22.s, z4.b, z3.b\n" + "udot z23.s, z6.b, z2.b\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "udot z24.s, z1.b, z4.b\n" - "udot z25.s, z2.b, z4.b\n" - "udot z26.s, z0.b, z5.b\n" - "udot z27.s, z1.b, z5.b\n" - "udot z28.s, z2.b, z5.b\n" - "udot z29.s, z0.b, z6.b\n" - "udot z30.s, z1.b, z6.b\n" - "udot z31.s, z2.b, z6.b\n" + "udot z24.s, z5.b, z2.b\n" + "udot z25.s, z4.b, z2.b\n" + "udot z26.s, z6.b, z1.b\n" + "udot z27.s, z5.b, z1.b\n" + "udot z28.s, z4.b, z1.b\n" + "udot z29.s, z6.b, z0.b\n" + "udot z30.s, z5.b, z0.b\n" + "udot z31.s, z4.b, z0.b\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" @@ -262,7 +266,7 @@ void sve_interleaved_u8u32_dot_8x3VL_a64fx( "bne 1b\n" : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + : "cc", "memory", "p0", "x20", "x21", "x22", "x23", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp index 99fff4e83d..1c88336c2d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp +++ 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_u8u32_dot_8x3VL( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *Apanel, + const uint8_t *Bpanel, + uint32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,10 +89,10 @@ void sve_interleaved_u8u32_dot_8x3VL( "3:" // main loop head "udot z8.s, z4.b, z0.b[0]\n" "udot z11.s, z4.b, z0.b[1]\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #32]\n" "udot z14.s, z4.b, z0.b[2]\n" "udot z17.s, z4.b, z0.b[3]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqb { z7.b }, p0/Z, [%x[Apanel], #48]\n" "udot z20.s, z4.b, z1.b[0]\n" "udot z23.s, z4.b, z1.b[1]\n" "sub x20, x20, #0x2\n" @@ -115,35 +119,35 @@ void sve_interleaved_u8u32_dot_8x3VL( "udot z25.s, z6.b, z1.b[1]\n" "udot z28.s, z6.b, z1.b[2]\n" "udot z31.s, z6.b, z1.b[3]\n" - "ld1b { z6.b }, p0/Z, [x22, #5, MUL VL]\n" + "ld1b { z2.b }, p0/Z, [x22, #5, MUL VL]\n" "addvl x22, x22, #6\n" - "udot z8.s, z4.b, z2.b[0]\n" - "udot z11.s, z4.b, z2.b[1]\n" + "udot z8.s, z4.b, z3.b[0]\n" + "udot z11.s, z4.b, z3.b[1]\n" "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" - "udot z14.s, z4.b, z2.b[2]\n" - "udot z17.s, z4.b, z2.b[3]\n" - "udot z20.s, z4.b, z3.b[0]\n" - "udot z23.s, z4.b, z3.b[1]\n" - "udot z26.s, z4.b, z3.b[2]\n" - "udot z29.s, z4.b, z3.b[3]\n" + "udot z14.s, z4.b, z3.b[2]\n" + "udot z17.s, z4.b, z3.b[3]\n" + "udot z20.s, z4.b, z7.b[0]\n" + "udot z23.s, z4.b, z7.b[1]\n" + "udot z26.s, z4.b, z7.b[2]\n" + "udot z29.s, z4.b, z7.b[3]\n" "ld1b { z4.b }, p0/Z, [x22]\n" - "udot z9.s, z5.b, z2.b[0]\n" - "udot z12.s, z5.b, z2.b[1]\n" - "udot z15.s, z5.b, z2.b[2]\n" - "udot z18.s, z5.b, z2.b[3]\n" - "udot z21.s, z5.b, z3.b[0]\n" - "udot z24.s, z5.b, z3.b[1]\n" - "udot z27.s, z5.b, z3.b[2]\n" - "udot z30.s, z5.b, z3.b[3]\n" + "udot z9.s, z5.b, z3.b[0]\n" + "udot z12.s, z5.b, z3.b[1]\n" + "udot z15.s, z5.b, z3.b[2]\n" + "udot z18.s, z5.b, z3.b[3]\n" + "udot z21.s, z5.b, z7.b[0]\n" + "udot z24.s, z5.b, z7.b[1]\n" + "udot z27.s, z5.b, z7.b[2]\n" + "udot z30.s, z5.b, z7.b[3]\n" "ld1b { z5.b }, p0/Z, [x22, #1, MUL VL]\n" - "udot z10.s, z6.b, z2.b[0]\n" - "udot z13.s, z6.b, z2.b[1]\n" - "udot z16.s, z6.b, z2.b[2]\n" - "udot z19.s, z6.b, z2.b[3]\n" - "udot z22.s, z6.b, z3.b[0]\n" - "udot z25.s, z6.b, z3.b[1]\n" - "udot z28.s, z6.b, z3.b[2]\n" - "udot z31.s, z6.b, z3.b[3]\n" + "udot z10.s, z2.b, z3.b[0]\n" + "udot z13.s, z2.b, z3.b[1]\n" + "udot z16.s, z2.b, z3.b[2]\n" + "udot z19.s, z2.b, z3.b[3]\n" + "udot z22.s, z2.b, z7.b[0]\n" + "udot z25.s, z2.b, z7.b[1]\n" + "udot z28.s, z2.b, z7.b[2]\n" + "udot z31.s, z2.b, z7.b[3]\n" "ld1b { z6.b }, p0/Z, [x22, #2, MUL VL]\n" "bge 3b\n" "4:" // main loop skip @@ -174,37 +178,37 @@ void sve_interleaved_u8u32_dot_8x3VL( "udot z28.s, z6.b, z1.b[2]\n" "udot z31.s, z6.b, z1.b[3]\n" "cbz x20, 5f\n" - "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n" - "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" + "ld1rqb { z4.b }, p0/Z, [%x[Apanel]]\n" + "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #16]\n" "add %x[Apanel], %x[Apanel], #0x20\n" - "ld1b { z7.b }, p0/Z, [x22]\n" - "ld1b { z4.b }, p0/Z, [x22, #1, MUL VL]\n" - "udot z8.s, z7.b, z0.b[0]\n" - "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" - "udot z11.s, z7.b, z0.b[1]\n" - "udot z14.s, z7.b, z0.b[2]\n" - "udot z17.s, z7.b, z0.b[3]\n" - "udot z20.s, z7.b, z1.b[0]\n" + "ld1b { z2.b }, p0/Z, [x22]\n" + "ld1b { 
z1.b }, p0/Z, [x22, #1, MUL VL]\n" + "udot z8.s, z2.b, z4.b[0]\n" + "ld1b { z0.b }, p0/Z, [x22, #2, MUL VL]\n" + "udot z11.s, z2.b, z4.b[1]\n" + "udot z14.s, z2.b, z4.b[2]\n" + "udot z17.s, z2.b, z4.b[3]\n" + "udot z20.s, z2.b, z3.b[0]\n" "addvl x22, x22, #3\n" - "udot z23.s, z7.b, z1.b[1]\n" - "udot z26.s, z7.b, z1.b[2]\n" - "udot z29.s, z7.b, z1.b[3]\n" - "udot z9.s, z4.b, z0.b[0]\n" - "udot z12.s, z4.b, z0.b[1]\n" - "udot z15.s, z4.b, z0.b[2]\n" - "udot z18.s, z4.b, z0.b[3]\n" - "udot z21.s, z4.b, z1.b[0]\n" - "udot z24.s, z4.b, z1.b[1]\n" - "udot z27.s, z4.b, z1.b[2]\n" - "udot z30.s, z4.b, z1.b[3]\n" - "udot z10.s, z5.b, z0.b[0]\n" - "udot z13.s, z5.b, z0.b[1]\n" - "udot z16.s, z5.b, z0.b[2]\n" - "udot z19.s, z5.b, z0.b[3]\n" - "udot z22.s, z5.b, z1.b[0]\n" - "udot z25.s, z5.b, z1.b[1]\n" - "udot z28.s, z5.b, z1.b[2]\n" - "udot z31.s, z5.b, z1.b[3]\n" + "udot z23.s, z2.b, z3.b[1]\n" + "udot z26.s, z2.b, z3.b[2]\n" + "udot z29.s, z2.b, z3.b[3]\n" + "udot z9.s, z1.b, z4.b[0]\n" + "udot z12.s, z1.b, z4.b[1]\n" + "udot z15.s, z1.b, z4.b[2]\n" + "udot z18.s, z1.b, z4.b[3]\n" + "udot z21.s, z1.b, z3.b[0]\n" + "udot z24.s, z1.b, z3.b[1]\n" + "udot z27.s, z1.b, z3.b[2]\n" + "udot z30.s, z1.b, z3.b[3]\n" + "udot z10.s, z0.b, z4.b[0]\n" + "udot z13.s, z0.b, z4.b[1]\n" + "udot z16.s, z0.b, z4.b[2]\n" + "udot z19.s, z0.b, z4.b[3]\n" + "udot z22.s, z0.b, z3.b[0]\n" + "udot z25.s, z0.b, z3.b[1]\n" + "udot z28.s, z0.b, z3.b[2]\n" + "udot z31.s, z0.b, z3.b[3]\n" "5:" // multiply loop done "st1w { z8.s }, p0, [%x[Cpanel]]\n" "subs x23, x23, #0x1\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp index 58d21d6c40..067d0bf258 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,20 +10,20 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/ #pragma once - #ifdef ARM_COMPUTE_ENABLE_SVE + #include "../std_transforms_sve.hpp" #include "../performance_parameters.hpp" @@ -55,11 +55,6 @@ public: return get_vector_length<uint32_t>() * 3; } - static unsigned int stripe_width() - { - return get_vector_length<uint32_t>(); - } - static constexpr unsigned int k_unroll() { return 8; } @@ -108,5 +103,4 @@ } // namespace arm_gemm #undef ARGLIST - #endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp index 0b70d034dd..28449ea99b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp @@ -29,8 +29,12 @@ namespace arm_gemm { void sve_interleaved_u8u32_mmla_8x3VL( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *Apanel, + const uint8_t *Bpanel, + uint32_t *Cpanel, + int ablocks, + int bblocks, + int K) { struct KernelArgs { size_t K = {}; @@ -85,82 +89,82 @@ void sve_interleaved_u8u32_mmla_8x3VL( "mov z31.s, #0x0\n" "blt 4f\n" "3:" // main loop head - "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel]]\n" ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n" ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n" ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n" ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n" - "ld1b { z6.b }, p0/Z, [x22]\n" + "ld1b { z7.b }, p0/Z, [x22]\n" ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n" ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" - ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n" - ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n" - ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x45c498da // ummla z26.s, z6.b, z4.b\n" + ".inst 0x45c598dd // ummla z29.s, z6.b, z5.b\n" + "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n" + ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n" + ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n" + ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n" "sub x20, x20, #0x2\n" - ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n" + ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n" + ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n" "cmp x20, #0x2\n" - ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n" - ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n" - ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n" - ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n" + ".inst 0x45c798db // ummla z27.s, z6.b, z7.b\n" + ".inst 0x45c398de // ummla z30.s, z6.b, z3.b\n" + "ld1b { z3.b }, p0/Z, [x22, #4, MUL VL]\n" + ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n" + ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n" "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" + ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n" + ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n" "ld1rqb { 
z1.b }, p0/Z, [%x[Apanel], #32]\n" - ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n" - ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n" + ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n" + ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n" "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n" - ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n" - "ld1b { z4.b }, p0/Z, [x22, #6, MUL VL]\n" - ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n" - "ld1b { z5.b }, p0/Z, [x22, #7, MUL VL]\n" + ".inst 0x45c598dc // ummla z28.s, z6.b, z5.b\n" + ".inst 0x45c498df // ummla z31.s, z6.b, z4.b\n" + "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #48]\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #64]\n" + "ld1b { z2.b }, p0/Z, [x22, #6, MUL VL]\n" + ".inst 0x45c39808 // ummla z8.s, z0.b, z3.b\n" + "ld1b { z4.b }, p0/Z, [x22, #7, MUL VL]\n" "addvl x22, x22, #16\n" ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" + ".inst 0x45c3982e // ummla z14.s, z1.b, z3.b\n" ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n" - ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n" - ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #-8, MUL VL]\n" + ".inst 0x45c398b4 // ummla z20.s, z5.b, z3.b\n" + ".inst 0x45c798b7 // ummla z23.s, z5.b, z7.b\n" + ".inst 0x45c398da // ummla z26.s, z6.b, z3.b\n" + ".inst 0x45c798dd // ummla z29.s, z6.b, z7.b\n" + "ld1b { z3.b }, p0/Z, [x22, #-8, MUL VL]\n" "ld1b { z7.b }, p0/Z, [x22, #-7, MUL VL]\n" - ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n" - ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n" - ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n" - ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n" - ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n" - ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n" - ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n" + ".inst 0x45c29809 // ummla z9.s, z0.b, z2.b\n" + ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n" + ".inst 0x45c2982f // ummla z15.s, z1.b, z2.b\n" + ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n" + ".inst 0x45c298b5 // ummla z21.s, z5.b, z2.b\n" + ".inst 0x45c498b8 // ummla z24.s, z5.b, z4.b\n" + ".inst 0x45c298db // ummla z27.s, z6.b, z2.b\n" + ".inst 0x45c498de // ummla z30.s, z6.b, z4.b\n" "ld1b { z4.b }, p0/Z, [x22, #-6, MUL VL]\n" - ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n" + ".inst 0x45c3980a // ummla z10.s, z0.b, z3.b\n" ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n" "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" + ".inst 0x45c39830 // ummla z16.s, z1.b, z3.b\n" ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n" "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" + ".inst 0x45c398b6 // ummla z22.s, z5.b, z3.b\n" + ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n" "ld1b { z5.b }, p0/Z, [x22, #-5, MUL VL]\n" - ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n" - ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n" + ".inst 0x45c398dc // ummla z28.s, z6.b, z3.b\n" + ".inst 0x45c798df // ummla z31.s, z6.b, z7.b\n" "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n" "add %x[Apanel], %x[Apanel], #0x80\n" "addvl x22, x22, #-4\n" "bge 3b\n" "4:" // main loop skip - "ld1rqb { z3.b }, p0/Z, 
[%x[Apanel]]\n" + "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n" ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n" ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n" ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n" @@ -168,114 +172,114 @@ void sve_interleaved_u8u32_mmla_8x3VL( "ld1b { z6.b }, p0/Z, [x22]\n" ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n" ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" - ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" + "ld1b { z3.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 0x45c498fa // ummla z26.s, z7.b, z4.b\n" + ".inst 0x45c598fd // ummla z29.s, z7.b, z5.b\n" + "ld1b { z5.b }, p0/Z, [x22, #2, MUL VL]\n" + "ld1b { z4.b }, p0/Z, [x22, #3, MUL VL]\n" ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n" - ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n" + ".inst 0x45c3980c // ummla z12.s, z0.b, z3.b\n" ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n" - ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n" + ".inst 0x45c39832 // ummla z18.s, z1.b, z3.b\n" "add %x[Apanel], %x[Apanel], #0x10\n" ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n" - ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n" + ".inst 0x45c39858 // ummla z24.s, z2.b, z3.b\n" "addvl x22, x22, #4\n" - ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n" - ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n" - ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n" - ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n" - ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n" - ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n" - ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n" - ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n" - ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n" - ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n" + ".inst 0x45c698fb // ummla z27.s, z7.b, z6.b\n" + ".inst 0x45c398fe // ummla z30.s, z7.b, z3.b\n" + ".inst 0x45c5980a // ummla z10.s, z0.b, z5.b\n" + ".inst 0x45c4980d // ummla z13.s, z0.b, z4.b\n" + ".inst 0x45c59830 // ummla z16.s, z1.b, z5.b\n" + ".inst 0x45c49833 // ummla z19.s, z1.b, z4.b\n" + ".inst 0x45c59856 // ummla z22.s, z2.b, z5.b\n" + ".inst 0x45c49859 // ummla z25.s, z2.b, z4.b\n" + ".inst 0x45c598fc // ummla z28.s, z7.b, z5.b\n" + ".inst 0x45c498ff // ummla z31.s, z7.b, z4.b\n" "cbz x20, 5f\n" - "ld1b { z6.b }, p0/Z, [x22]\n" - "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n" - ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n" - "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n" - "ld1b { z7.b }, p0/Z, [x22, #1, MUL VL]\n" - ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n" - "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n" - "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n" - ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n" - ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n" - ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n" - "ld1b { z4.b }, p0/Z, [x22, #2, MUL VL]\n" - ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n" - ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n" - "ld1b { z5.b }, p0/Z, [x22, #3, MUL VL]\n" - ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n" - "ld1b { z6.b }, p0/Z, [x22, #4, MUL VL]\n" - "ld1b { z7.b }, p0/Z, [x22, #5, MUL VL]\n" - ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n" - ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n" + "ld1b { z1.b }, p0/Z, [x22]\n" + "ld1rqb { z7.b }, p0/Z, [%x[Apanel]]\n" + ".inst 0x45c198e8 // ummla z8.s, z7.b, z1.b\n" + "ld1rqb { z6.b }, p0/Z, [%x[Apanel], #16]\n" + "ld1b { z0.b }, p0/Z, [x22, #1, MUL VL]\n" + ".inst 
0x45c098eb // ummla z11.s, z7.b, z0.b\n" + "ld1rqb { z5.b }, p0/Z, [%x[Apanel], #32]\n" + "ld1rqb { z4.b }, p0/Z, [%x[Apanel], #48]\n" + ".inst 0x45c198ce // ummla z14.s, z6.b, z1.b\n" + ".inst 0x45c098d1 // ummla z17.s, z6.b, z0.b\n" + ".inst 0x45c198b4 // ummla z20.s, z5.b, z1.b\n" + "ld1b { z3.b }, p0/Z, [x22, #2, MUL VL]\n" + ".inst 0x45c098b7 // ummla z23.s, z5.b, z0.b\n" + ".inst 0x45c1989a // ummla z26.s, z4.b, z1.b\n" + "ld1b { z2.b }, p0/Z, [x22, #3, MUL VL]\n" + ".inst 0x45c0989d // ummla z29.s, z4.b, z0.b\n" + "ld1b { z1.b }, p0/Z, [x22, #4, MUL VL]\n" + "ld1b { z0.b }, p0/Z, [x22, #5, MUL VL]\n" + ".inst 0x45c398e9 // ummla z9.s, z7.b, z3.b\n" + ".inst 0x45c298ec // ummla z12.s, z7.b, z2.b\n" "addvl x22, x22, #6\n" - ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n" - ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n" + ".inst 0x45c398cf // ummla z15.s, z6.b, z3.b\n" + ".inst 0x45c298d2 // ummla z18.s, z6.b, z2.b\n" "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n" - ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n" - ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n" - ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n" - ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n" - ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n" - ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n" - ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n" - ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n" - ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" - ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n" - ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n" + ".inst 0x45c398b5 // ummla z21.s, z5.b, z3.b\n" + ".inst 0x45c298b8 // ummla z24.s, z5.b, z2.b\n" + ".inst 0x45c3989b // ummla z27.s, z4.b, z3.b\n" + ".inst 0x45c2989e // ummla z30.s, z4.b, z2.b\n" + ".inst 0x45c198ea // ummla z10.s, z7.b, z1.b\n" + ".inst 0x45c098ed // ummla z13.s, z7.b, z0.b\n" + ".inst 0x45c198d0 // ummla z16.s, z6.b, z1.b\n" + ".inst 0x45c098d3 // ummla z19.s, z6.b, z0.b\n" + ".inst 0x45c198b6 // ummla z22.s, z5.b, z1.b\n" + ".inst 0x45c098b9 // ummla z25.s, z5.b, z0.b\n" + ".inst 0x45c1989c // ummla z28.s, z4.b, z1.b\n" + ".inst 0x45c0989f // ummla z31.s, z4.b, z0.b\n" "5:" // multiply loop done - "uzp1 z4.d, z8.d, z11.d\n" + "uzp1 z0.d, z8.d, z11.d\n" "uzp2 z8.d, z8.d, z11.d\n" - "st1w { z4.s }, p0, [%x[Cpanel]]\n" - "uzp1 z11.d, z9.d, z12.d\n" + "st1w { z0.s }, p0, [%x[Cpanel]]\n" + "uzp1 z0.d, z9.d, z12.d\n" "uzp2 z9.d, z9.d, z12.d\n" - "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "uzp1 z12.d, z10.d, z13.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #1, MUL VL]\n" + "uzp1 z0.d, z10.d, z13.d\n" "uzp2 z10.d, z10.d, z13.d\n" - "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z0.s }, p0, [%x[Cpanel], #2, MUL VL]\n" "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "uzp1 z13.d, z14.d, z17.d\n" + "uzp1 z0.d, z14.d, z17.d\n" "uzp2 z14.d, z14.d, z17.d\n" "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n" - "uzp1 z17.d, z15.d, z18.d\n" + "uzp1 z1.d, z15.d, z18.d\n" "subs x23, x23, #0x1\n" "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "uzp2 z15.d, z15.d, z18.d\n" - "uzp1 z18.d, z16.d, z19.d\n" - "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n" + "uzp1 z17.d, z16.d, z19.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "uzp2 z16.d, z16.d, z19.d\n" - "uzp1 z19.d, z20.d, z23.d\n" - "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n" + "uzp1 z0.d, z20.d, z23.d\n" + "st1w { z1.s }, p0, [%x[Cpanel], #7, MUL VL]\n" "addvl %x[Cpanel], %x[Cpanel], #16\n" "uzp2 z20.d, z20.d, z23.d\n" - "st1w { z18.s }, p0, [%x[Cpanel], #-8, 
MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #-8, MUL VL]\n" "uzp1 z23.d, z21.d, z24.d\n" "uzp2 z21.d, z21.d, z24.d\n" "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n" - "uzp1 z24.d, z22.d, z25.d\n" + "uzp1 z19.d, z22.d, z25.d\n" "uzp2 z22.d, z22.d, z25.d\n" "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n" - "uzp1 z25.d, z26.d, z29.d\n" + "uzp1 z18.d, z26.d, z29.d\n" "uzp2 z26.d, z26.d, z29.d\n" "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n" - "uzp1 z29.d, z27.d, z30.d\n" + "uzp1 z17.d, z27.d, z30.d\n" "uzp2 z27.d, z27.d, z30.d\n" - "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" - "uzp1 z30.d, z28.d, z31.d\n" + "st1w { z0.s }, p0, [%x[Cpanel], #-4, MUL VL]\n" + "uzp1 z16.d, z28.d, z31.d\n" "uzp2 z28.d, z28.d, z31.d\n" "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n" - "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" + "st1w { z19.s }, p0, [%x[Cpanel], #-2, MUL VL]\n" "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n" "st1w { z21.s }, p0, [%x[Cpanel]]\n" "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n" - "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n" - "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n" - "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n" + "st1w { z18.s }, p0, [%x[Cpanel], #2, MUL VL]\n" + "st1w { z17.s }, p0, [%x[Cpanel], #3, MUL VL]\n" + "st1w { z16.s }, p0, [%x[Cpanel], #4, MUL VL]\n" "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n" "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n" "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n" @@ -290,4 +294,4 @@ void sve_interleaved_u8u32_mmla_8x3VL( } } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // ARM_COMPUTE_ENABLE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp index cf99bbdb46..87310d996d 100644 --- a/src/core/NEON/kernels/arm_gemm/misc.cpp +++ b/src/core/NEON/kernels/arm_gemm/misc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, 2022 Arm Limited. + * Copyright (c) 2017-2018, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,10 +56,14 @@ WeightFormat get_weight_format(const KernelWeightFormat kwf, size_t element_size wf_i |= 0x10; } +#ifdef ARM_COMPUTE_ENABLE_SVE // Get total bytes in vector output if (kwf_i & 0x1) { vector_bytes = vector_count * get_vector_length<uint8_t>(); } else { +#else + if (1) { +#endif vector_bytes = vector_count * 16; } diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp index 3f3443025c..31dd65b397 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.hpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,7 +42,7 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h unsigned int multi, unsigned int first_col); template<typename T> -void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg, +void row_sums_indirect(size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg, size_t M, int32_t *output_ptr, const Requantize32 *qp); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp index 7345793f93..94cd7ddfeb 100644 --- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp +++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp @@ -34,7 +34,7 @@ namespace arm_gemm { template<> void row_sums_indirect( - unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, size_t M, int32_t *out_ptr, const Requantize32 *qp ) { diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp index ff95507d79..2ab0397fda 100644 --- a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp +++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp @@ -34,7 +34,7 @@ namespace arm_gemm { template<> void row_sums_indirect( - unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, size_t M, int32_t *out_ptr, const Requantize32 *qp ) { diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp index ae452e1184..afe24e7ce0 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sme.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,9 +67,8 @@ public: } template<typename TOut> - void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool accumulate) { + void Merge(TOut *, const TResult *, int, int, int, int, int, const TOut *, const Activation, bool) { // Separate merge not supported for SME. - ARM_COMPUTE_UNUSED(out, in, stride, y0, ymax, x0, xmax, bias, act, accumulate); } }; diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp index ef5a01a578..5aa62f0fe4 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.cpp +++ b/src/core/NEON/kernels/arm_gemm/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -132,7 +132,9 @@ template void Transform<8, 1, true, VLType::None>(float *, const float *, int, i #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int); #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#ifdef ARM_COMPUTE_ENABLE_BF16 template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int); +#endif #endif // AArch32 } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp index e6186984e8..8574d89226 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -193,7 +193,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -264,7 +263,6 @@ void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t widt "add %x[out], %x[out], #0x80\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -286,4 +284,5 @@ void Transform<32, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp index 6d97f71c7d..cdf1f98608 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -427,4 +427,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp index 96d132b74f..da0809d4d6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 12 * roundup(height, 8) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -332,4 +331,5 @@ void Transform<12, 8, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp index 04af6fd713..cef468e9cc 100644 --- 
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -236,7 +236,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "mov x20, %x[width]\n" @@ -319,7 +318,6 @@ void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -341,4 +339,5 @@ void Transform<12, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp index e6ddc10e04..4c02d0534d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -276,7 +276,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -420,7 +419,6 @@ void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x60\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -442,4 +440,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp index e487d4d839..2a3208d18d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -710,7 +710,6 @@ void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0x60\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", 
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -731,4 +730,5 @@ void Transform<12, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp index 7938325fa4..4d9d5e7f43 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -182,7 +182,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,7 +250,6 @@ void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t wi "add %x[out], %x[out], #0x18\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp index 4c66fb2c2f..b0cd7e4ef7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -182,7 +182,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,7 +250,6 @@ void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t "add %x[out], %x[out], #0x18\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -272,4 +270,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp index f06c167361..0399f8becc 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -137,4 +137,5 @@ void Transform<4, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git 
a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp index e0ccb368c2..f3a1dde73f 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -327,4 +327,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp index fa45f4fd4d..7c7e91e666 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 16 * roundup(height, 8) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -288,4 +287,5 @@ void Transform<16, 8, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp index 06efa9781e..b4515cbfd4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -163,7 +163,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 16f\n" "8:" // Main loop skip - "9:" // Tail row loop: Head "mov x9, %x[in]\n" "mov x20, %x[width]\n" @@ -221,7 +220,6 @@ void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x40\n" "bge 9b\n" "16:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -243,4 +241,5 @@ void Transform<16, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp index dafa53eec3..ac67467240 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -320,7 +320,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -486,7 +485,6 @@ void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t w 
"add %x[out], %x[out], #0x80\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -508,4 +506,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp index e012d0920f..b9fe8b126a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -281,7 +281,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si "bge 1b\n" "cbz %x[height], 16f\n" "8:" // Main loop skip - "9:" // Tail row loop: Head "mov x9, %x[in]\n" "add x28, x9, %x[in_stride]\n" @@ -423,7 +422,6 @@ void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0x80\n" "bge 9b\n" "16:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -444,4 +442,5 @@ void Transform<16, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp index 20f9d39f4e..46211ad4e4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -158,7 +158,6 @@ void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -268,4 +267,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp index 22d68acd51..1cb7bc4445 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -762,7 +762,6 @@ void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, si "add %x[out], %x[out], #0xc0\n" "bge 11b\n" 
"20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" @@ -783,4 +782,5 @@ void Transform<24, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp index 799a9cd91d..dcaf69d2a8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -198,7 +198,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -271,7 +270,6 @@ void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -292,4 +290,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp index 621c5f99ff..966b75664e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -270,7 +270,6 @@ void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t w "add %x[out], %x[out], #0x30\n" "bge 11b\n" "20:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -291,4 +290,5 @@ void Transform<12, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp index 5cd7bd0512..4a22675028 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -503,4 +503,5 @@ void Transform<32, 4, true, 
VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp index 706d7cd359..237536697c 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -280,7 +280,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w "bge 1b\n" "cbz %x[height], 24f\n" "12:" // Main loop skip - "13:" // Tail row loop: Head "mov x25, %x[in]\n" "mov x20, %x[width]\n" @@ -427,7 +426,6 @@ void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t w "add %x[out], %x[out], #0x80\n" "bge 13b\n" "24:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25" @@ -449,4 +447,5 @@ void Transform<32, 2, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp index b4827525cd..f35752d5a8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -137,7 +137,6 @@ void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -241,4 +240,5 @@ void Transform<6, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp index e1ab14e594..6ef02ac044 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -39,7 +39,6 @@ void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t wid size_t out_stride = 4 * roundup(height, 16) * sizeof(uint8_t); __asm__ __volatile__( - "1:" // Main row loop: Head "mov x17, %x[in]\n" "add x16, x17, %x[in_stride]\n" @@ -316,4 +315,5 @@ void Transform<4, 16, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp index 8adc69e8b3..5667820865 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) 
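Every a64 transform in this patch makes the same guard change as the hunk above: `#ifdef __aarch64__` becomes `#if defined(__aarch64__)`, with the closing `#endif` annotated to match. The two forms are equivalent for a single macro, but `#if defined(...)` composes with further conditions, which is what the SVE and SME files below rely on when they move from the compiler-set `__ARM_FEATURE_SVE` to the build-option macros `ARM_COMPUTE_ENABLE_SME` / `ARM_COMPUTE_ENABLE_SVE`. A sketch of the difference (the second macro is hypothetical, purely for illustration):

    #ifdef __aarch64__                // tests exactly one macro; cannot be extended in place
    #endif

    #if defined(__aarch64__)          // same test, but composes with boolean operators...
    #endif // defined(__aarch64__)

    // ...so a guard can grow extra conditions without being rewritten:
    #if defined(__aarch64__) && !defined(HYPOTHETICAL_OPT_OUT)
    #endif // defined(__aarch64__) && !defined(HYPOTHETICAL_OPT_OUT)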
namespace { @@ -333,4 +333,5 @@ void Transform<4, 4, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp index 07602bdc8d..328274a488 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -145,7 +145,6 @@ void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -251,4 +250,5 @@ void Transform<32, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp index a048fbb109..feb469ab0e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp @@ -24,7 +24,7 @@ #pragma once -#ifdef __aarch64__ +#if defined(__aarch64__) namespace { @@ -177,7 +177,6 @@ void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width "bge 1b\n" "cbz %x[height], 20f\n" "10:" // Main loop skip - "11:" // Tail row loop: Head "mov x20, %x[width]\n" "mov x25, %x[in]\n" @@ -265,4 +264,5 @@ void Transform<24, 1, true, VLType::None>( ); } -#endif + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp index 01921c5ad9..a4d480c405 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -140,4 +139,5 @@ void Transform<16, 1, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp index 6b9b471fdc..552abfc1c6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -178,4 +177,5 @@ void Transform<16, 4, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp index 96128cf9c2..9c6f5c83a1 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -150,4 +149,5 @@ void Transform<16, 2, true, VLType::SME>( ); } -#endif // __ARM_FEATURE_SVE + +#endif // 
defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp index 080db1c5c1..2756327815 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_16VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -197,4 +196,5 @@ void Transform<16, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp index 7e496095f4..a6ddb8fec0 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -210,4 +209,5 @@ void Transform<1, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp index 45d3c0729e..399a52e233 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -146,4 +145,5 @@ void Transform<1, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp index 7120d1d33e..6318e29a79 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -206,4 +205,4 @@ void Transform<1, 2, true, VLType::SME>( ); } -#endif +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp index 72e7b0c99a..b90063028d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_1VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -219,4 +218,5 @@ void Transform<1, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp index a057fd514e..f827197ab7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if 
defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -222,4 +221,5 @@ void Transform<2, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp index 9eb4075677..c471d66e17 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -146,4 +145,5 @@ void Transform<2, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp index 3fc3920500..5f967fa615 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -208,4 +207,5 @@ void Transform<2, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp index 9d402a2d58..f22b833821 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_2VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -236,4 +235,5 @@ void Transform<2, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp index 362bebbea0..14636e3218 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -185,4 +184,5 @@ void Transform<4, 1, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp index cbcc0b4c8b..2d46a481f3 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -121,4 +120,5 @@ void Transform<4, 4, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp index 9b28578217..002a12479a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -168,4 +167,5 @@ void Transform<4, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp index 8873070019..2a43f34f71 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_4VL_2x2_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SME) namespace { @@ -186,4 +185,5 @@ void Transform<4, 2, true, VLType::SME>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SME) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp new file mode 100644 index 0000000000..be9ad666a9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL.hpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SME) + +namespace { + +void sme_transpose_interleave_8VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height) +{ + size_t out_stride = 8 * height * sme::get_vector_length(); + + __asm__ __volatile__( + ".inst 0xd503477f // SMSTART ZA\n" + "cmp %x[height], #0x2\n" + "ptrue p7.b\n" + "blt 4f\n" + "1:" // Main row loop: Head + "mov x25, %x[in]\n" + "add x24, x25, %x[in_stride]\n" + "add %x[in], x24, %x[in_stride]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x2\n" + "mov x22, %x[width]\n" + "2:" // Main row loop: Column loop + "mov x21, x22\n" + "whilelt p0.h, XZR, x21\n" + "ld1h { z31.h }, p0/Z, [x25]\n" + "dech x21\n" + "whilelt p6.h, XZR, x21\n" + "ld1h { z30.h }, p6/Z, [x25, #1, MUL VL]\n" + "dech x21\n" + "whilelt p5.h, XZR, x21\n" + "ld1h { z29.h }, p5/Z, [x25, #2, MUL VL]\n" + "dech x21\n" + "whilelt p4.h, XZR, x21\n" + "ld1h { z28.h }, p4/Z, [x25, #3, MUL VL]\n" + "dech x21\n" + "whilelt p3.h, XZR, x21\n" + "ld1h { z27.h }, p3/Z, [x25, #4, MUL VL]\n" + "dech x21\n" + "whilelt p2.h, XZR, x21\n" + "ld1h { z26.h }, p2/Z, [x25, #5, MUL VL]\n" + "dech x21\n" + "whilelt p1.h, XZR, x21\n" + "ld1h { z25.h }, p1/Z, [x25, #6, MUL VL]\n" + "dech x21\n" + "mov x20, x23\n" + "ld1h { z24.h }, p0/Z, [x24]\n" + "whilelt p0.h, XZR, x21\n" + "dech x22, ALL, MUL #8\n" + "ld1h { z23.h }, p0/Z, [x25, #7, MUL VL]\n" + "ld1h { z22.h }, p6/Z, [x24, #1, MUL VL]\n" + "cmp x22, #0x0\n" + "addvl x25, x25, #8\n" + "ld1h { z21.h }, p5/Z, [x24, #2, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "ld1h { z20.h }, p4/Z, [x24, #3, MUL VL]\n" + "ld1h { z19.h }, p3/Z, [x24, #4, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x24, #5, MUL VL]\n" + "ld1h { z17.h }, p1/Z, [x24, #6, MUL VL]\n" + "ld1h { z16.h }, p0/Z, [x24, #7, MUL VL]\n" + "st1h { z31.h }, p7, [x20]\n" + "addvl x24, x24, #8\n" + "st1h { z30.h }, p7, [x20, #1, MUL VL]\n" + "st1h { z29.h }, p7, [x20, #2, MUL VL]\n" + "st1h { z28.h }, p7, [x20, #3, MUL VL]\n" + "st1h { z27.h }, p7, [x20, #4, MUL VL]\n" + "st1h { z26.h }, p7, [x20, #5, MUL VL]\n" + "st1h { z25.h }, p7, [x20, #6, MUL VL]\n" + "st1h { z23.h }, p7, [x20, #7, MUL VL]\n" + "addvl x20, x20, #16\n" + "st1h { z24.h }, p7, [x20, #-8, MUL VL]\n" + "st1h { z22.h }, p7, [x20, #-7, MUL VL]\n" + "st1h { z21.h }, p7, [x20, #-6, MUL VL]\n" + "st1h { z20.h }, p7, [x20, #-5, MUL VL]\n" + "st1h { z19.h }, p7, [x20, #-4, MUL VL]\n" + "st1h { z18.h }, p7, [x20, #-3, MUL VL]\n" + "st1h { z17.h }, p7, [x20, #-2, MUL VL]\n" + "st1h { z16.h }, p7, [x20, #-1, MUL VL]\n" + "bgt 2b\n" + "3:" // Main row loop: Column loop skip + "cmp %x[height], #0x2\n" + "addvl %x[out], %x[out], #16\n" + "bge 1b\n" + "cbz %x[height], 8f\n" + "4:" // Main loop skip + "5:" // Tail row loop: Head + "mov x25, %x[in]\n" + "add %x[in], x25, %x[in_stride]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x1\n" + "mov x21, %x[width]\n" + "6:" // Tail row loop: Column loop + "mov x20, x21\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z23.h }, p0/Z, [x25]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z22.h }, p0/Z, [x25, #1, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z21.h }, p0/Z, [x25, #2, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z20.h }, p0/Z, [x25, #3, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z19.h }, p0/Z, [x25, #4, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z18.h }, p0/Z, [x25, #5, MUL VL]\n" + "dech x20\n" + "whilelt p0.h, XZR, x20\n" + "ld1h { z17.h 
}, p0/Z, [x25, #6, MUL VL]\n"
+      "dech x20\n"
+      "dech x21, ALL, MUL #8\n"
+      "whilelt p0.h, XZR, x20\n"
+      "cmp x21, #0x0\n"
+      "ld1h { z16.h }, p0/Z, [x25, #7, MUL VL]\n"
+      "st1h { z23.h }, p7, [x23]\n"
+      "addvl x25, x25, #8\n"
+      "st1h { z22.h }, p7, [x23, #1, MUL VL]\n"
+      "st1h { z21.h }, p7, [x23, #2, MUL VL]\n"
+      "st1h { z20.h }, p7, [x23, #3, MUL VL]\n"
+      "st1h { z19.h }, p7, [x23, #4, MUL VL]\n"
+      "st1h { z18.h }, p7, [x23, #5, MUL VL]\n"
+      "st1h { z17.h }, p7, [x23, #6, MUL VL]\n"
+      "st1h { z16.h }, p7, [x23, #7, MUL VL]\n"
+      "add x23, x23, %x[out_stride]\n"
+      "bgt 6b\n"
+      "7:" // Tail row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 5b\n"
+      "8:" // Done
+      ".inst 0xd503467f // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(float) / 2,
+        stride * sizeof(float),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(bfloat16) / 2,
+        stride * sizeof(bfloat16),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 1, true, VLType::SME>(
+    __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(__fp16) / 2,
+        stride * sizeof(__fp16),
+        (kmax-k0)
+    );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
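The three `Transform` specializations above show the width convention these wrappers use: the transpose kernels move bits, never values, so `float`, `bfloat16` and `__fp16` can all funnel through the single `uint16_t` kernel once the storage is reinterpreted and the width is rescaled into 16-bit words (`sizeof(T) / 2` words per element, hence `(xmax-x0) * sizeof(float) / 2`). A scalar sketch of the same convention; the function name is illustrative, not from the library:

    #include <cstdint>

    // A bitwise transpose never inspects element values, so a type whose size
    // is a multiple of the kernel word can be routed through a uint16_t kernel
    // by reinterpreting the storage and rescaling the width into kernel words,
    // mirroring the wrappers above.
    void copy_floats_as_u16_words(uint16_t *dst, const float *src, int n_floats) {
        const uint16_t *words = reinterpret_cast<const uint16_t *>(src);
        const int n_words = n_floats * int(sizeof(float) / sizeof(uint16_t)); // 2 words per float
        for (int i = 0; i < n_words; i++) {
            dst[i] = words[i];
        }
    }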
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..45d2e24258
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+    uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+    if (height % 4) {
+        memset(pad_row, 0, width * sizeof(uint8_t));
+    }
+
+    size_t out_stride = 8 * roundup(height, 4) * sme::get_vector_length();
+
+    __asm__ __volatile__(
+      ".inst 0xd503477f // SMSTART ZA\n"
+      "ptrue p2.b\n"
+      "1:" // Main row loop: Head
+      "mov x26, %x[in]\n"
+      "add x25, x26, %x[in_stride]\n"
+      "add x24, x25, %x[in_stride]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "cmp %x[height], #0x3\n"
+      "add %x[in], x23, %x[in_stride]\n"
+      "csel x23, x23, %x[pad_row], GT\n"
+      "csel x24, x24, %x[pad_row], GE\n"
+      "cmp %x[height], #0x1\n"
+      "mov x22, %x[out]\n"
+      "csel x25, x25, %x[pad_row], GT\n"
+      "sub %x[height], %x[height], #0x4\n"
+      "mov x21, %x[width]\n"
+      "2:" // Main row loop: Column loop
+      "mov x20, x21\n"
+      "whilelt p1.b, XZR, x20\n"
+      "ld1b { z19.b }, p1/Z, [x26]\n"
+      "decb x20\n"
+      "whilelt p0.b, XZR, x20\n"
+      "ld1b { z17.b }, p0/Z, [x26, #1, MUL VL]\n"
+      "ld1b { z18.b }, p1/Z, [x25]\n"
+      "decw x21, ALL, MUL #8\n"
+      "cmp x21, #0x0\n"
+      "ld1b { z21.b }, p0/Z, [x25, #1, MUL VL]\n"
+      "addvl x26, x26, #2\n"
+      "addvl x25, x25, #2\n"
+      "ld1b { z16.b }, p1/Z, [x24]\n"
+      "zip1 z24.b, z19.b, z16.b\n"
+      "zip2 z20.b, z19.b, z16.b\n"
+      "ld1b { z16.b }, p0/Z, [x24, #1, MUL VL]\n"
+      "zip1 z23.b, z17.b, z16.b\n"
+      "zip2 z22.b, z17.b, z16.b\n"
+      "addvl x24, x24, #2\n"
+      "ld1b { z16.b }, p1/Z, [x23]\n"
+      "zip1 z17.b, z18.b, z16.b\n"
+      "zip2 z19.b, z18.b, z16.b\n"
+      "ld1b { z16.b }, p0/Z, [x23, #1, MUL VL]\n"
+      "zip1 z18.b, z21.b, z16.b\n"
+      "zip2 z21.b, z21.b, z16.b\n"
+      "addvl x23, x23, #2\n"
+      "zip1 z16.b, z24.b, z17.b\n"
+      "zip2 z17.b, z24.b, z17.b\n"
+      "st1b { z16.b }, p2, [x22]\n"
+      "zip1 z16.b, z20.b, z19.b\n"
+      "zip2 z20.b, z20.b, z19.b\n"
+      "st1b { z17.b }, p2, [x22, #1, MUL VL]\n"
+      "zip1 z19.b, z23.b, z18.b\n"
+      "zip2 z18.b, z23.b, z18.b\n"
+      "st1b { z16.b }, p2, [x22, #2, MUL VL]\n"
+      "zip1 z17.b, z22.b, z21.b\n"
+      "zip2 z16.b, z22.b, z21.b\n"
+      "st1b { z20.b }, p2, [x22, #3, MUL VL]\n"
+      "st1b { z19.b }, p2, [x22, #4, MUL VL]\n"
+      "st1b { z18.b }, p2, [x22, #5, MUL VL]\n"
+      "st1b { z17.b }, p2, [x22, #6, MUL VL]\n"
+      "st1b { z16.b }, p2, [x22, #7, MUL VL]\n"
+      "add x22, x22, %x[out_stride]\n"
+      "bgt 2b\n"
+      "3:" // Main row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 1b\n"
+      ".inst 0xd503467f // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+    uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(uint8_t) / 1,
+        stride * sizeof(uint8_t),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SME>(
+    int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_1x4(
+        reinterpret_cast<uint8_t *>(out),
+        reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(int8_t) / 1,
+        stride * sizeof(int8_t),
+        (kmax-k0)
+    );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
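The 1x4 kernel above also shows the two tail-handling tricks shared by these SME transforms: a stack-allocated `pad_row` of zeroes substitutes for missing rows when `height % 4 != 0` (the `csel` instructions select it), and `whilelt`/`decb` predicates switch off vector lanes at and beyond the remaining width, so no scalar remainder loop is needed. A scalar model of both, with simplified dimensions and a placeholder lane count in place of the hardware vector length:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    const size_t kLanes = 16; // placeholder for the VL-dependent lane count

    // Models "whilelt p1.b, XZR, x20" plus a predicated ld1b: lanes below
    // `remaining` are active; the rest read as zero instead of loading.
    void predicated_load(uint8_t *dst, const uint8_t *src, size_t remaining) {
        for (size_t lane = 0; lane < kLanes; lane++) {
            dst[lane] = (lane < remaining) ? src[lane] : 0;
        }
    }

    // Models the pad_row substitution: rows past `height` read from zeroes,
    // so a 4-row interleave group is always complete.
    const uint8_t *row_or_pad(const uint8_t *in, size_t in_stride,
                              size_t height, size_t r, const std::vector<uint8_t> &pad) {
        return (r < height) ? in + r * in_stride : pad.data();
    }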
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..ec7c415e27
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sme_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace {
+
+void sme_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+    uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+    if (height % 2) {
+        memset(pad_row, 0, width * sizeof(uint16_t));
+    }
+
+    size_t out_stride = 8 * roundup(height, 2) * sme::get_vector_length();
+
+    __asm__ __volatile__(
+      ".inst 0xd503477f // SMSTART ZA\n"
+      "ptrue p4.b\n"
+      "1:" // Main row loop: Head
+      "mov x24, %x[in]\n"
+      "add x23, x24, %x[in_stride]\n"
+      "cmp %x[height], #0x1\n"
+      "add %x[in], x23, %x[in_stride]\n"
+      "mov x22, %x[out]\n"
+      "csel x23, x23, %x[pad_row], GT\n"
+      "sub %x[height], %x[height], #0x2\n"
+      "mov x21, %x[width]\n"
+      "2:" // Main row loop: Column loop
+      "mov x20, x21\n"
+      "whilelt p3.h, XZR, x20\n"
+      "ld1h { z20.h }, p3/Z, [x24]\n"
+      "dech x20\n"
+      "whilelt p2.h, XZR, x20\n"
+      "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p1.h, XZR, x20\n"
+      "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+      "dech x20\n"
+      "whilelt p0.h, XZR, x20\n"
+      "ld1h { z24.h }, p0/Z, [x24, #3, MUL VL]\n"
+      "ld1h { z17.h }, p3/Z, [x23]\n"
+      "decw x21, ALL, MUL #8\n"
+      "cmp x21, #0x0\n"
+      "zip1 z23.h, z20.h, z17.h\n"
+      "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+      "addvl x24, x24, #4\n"
+      "zip2 z22.h, z20.h, z17.h\n"
+      "zip1 z21.h, z19.h, z16.h\n"
+      "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+      "zip2 z20.h, z19.h, z16.h\n"
+      "zip1 z19.h, z18.h, z17.h\n"
+      "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+      "addvl x23, x23, #4\n"
+      "zip2 z18.h, z18.h, z17.h\n"
+      "zip1 z17.h, z24.h, z16.h\n"
+      "zip2 z16.h, z24.h, z16.h\n"
+      "st1h { z23.h }, p4, [x22]\n"
+      "st1h { z22.h }, p4, [x22, #1, MUL VL]\n"
+      "st1h { z21.h }, p4, [x22, #2, MUL VL]\n"
+      "st1h { z20.h }, p4, [x22, #3, MUL VL]\n"
+      "st1h { z19.h }, p4, [x22, #4, MUL VL]\n"
+      "st1h { z18.h }, p4, [x22, #5, MUL VL]\n"
+      "st1h { z17.h }, p4, [x22, #6, MUL VL]\n"
+      "st1h { z16.h }, p4, [x22, #7, MUL VL]\n"
+      "add x22, x22, %x[out_stride]\n"
+      "bgt 2b\n"
+      "3:" // Main row loop: Column loop skip
+      "cmp %x[height], #0x1\n"
+      "addvl %x[out], %x[out], #8\n"
+      "bge 1b\n"
+      ".inst 0xd503467f // SMSTOP\n"
+      : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+      : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+    );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+    bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_2x2(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(bfloat16) / 2,
+        stride * sizeof(bfloat16),
+        (kmax-k0)
+    );
+}
+
+template<>
+void Transform<8, 2, true, VLType::SME>(
+    __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+    sme_transpose_interleave_8VL_2x2(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+        (xmax-x0) * sizeof(__fp16) / 2,
+        stride * sizeof(__fp16),
+        (kmax-k0)
+    );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git
a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp index 847718992a..f627fe575f 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -373,4 +372,5 @@ void Transform<12, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp index 74fce4ddf9..b33c4f6c2d 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -101,7 +100,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x21, %x[width]\n" "cntw x20, ALL, MUL #2\n" @@ -138,7 +136,6 @@ void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t widt "addvl %x[out], %x[out], #1\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23" @@ -160,4 +157,5 @@ void Transform<1, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp index a034be5e74..e468787815 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -305,4 +304,5 @@ void Transform<1, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp index 82d4184061..546800fa69 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -93,7 +92,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt "bge 1b\n" "cbz %x[height], 8f\n" "4:" // Main loop skip - "5:" // Tail row loop: Head "mov x26, %x[in]\n" "add %x[in], x26, %x[in_stride]\n" @@ -123,7 +121,6 @@ void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t widt "addvl %x[out], %x[out], #3\n" "bge 5b\n" "8:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", 
"p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27" @@ -171,4 +168,5 @@ void Transform<3, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp index ec7095db7b..a44141c109 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -329,7 +328,6 @@ void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t wi "addvl %x[out], %x[out], #3\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -364,4 +362,5 @@ void Transform<3, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp index 3d14383a64..36a15a16b3 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -292,7 +291,6 @@ void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #3\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -314,4 +312,5 @@ void Transform<3, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp index a39235187f..e661e2698a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -103,7 +102,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt "bge 1b\n" "cbz %x[height], 8f\n" "4:" // Main loop skip - "5:" // Tail row loop: Head "mov x26, %x[in]\n" "add %x[in], x26, %x[in_stride]\n" @@ -137,7 +135,6 @@ void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t widt 
"addvl %x[out], %x[out], #4\n" "bge 5b\n" "8:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -185,4 +182,5 @@ void Transform<4, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp index e3489398d4..03a78f72f1 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -317,4 +316,5 @@ void Transform<4, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp index 9505dc5e6d..b196799cfe 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -248,7 +247,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x12, %x[in]\n" "mov x21, %x[width]\n" @@ -323,7 +321,6 @@ void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #4\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -345,4 +342,5 @@ void Transform<4, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp index 982c0545ed..68fe2d0cbe 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -292,4 +291,5 @@ void Transform<6, 8, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp index 2b5741a49c..910fc6cb02 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -260,7 +259,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t "bge 1b\n" "cbz %x[height], 12f\n" "6:" // Main loop skip - "7:" // Tail row loop: Head "mov x12, %x[in]\n" "add x11, x12, %x[in_stride]\n" @@ -386,7 +384,6 @@ void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #6\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -408,4 +405,5 @@ void Transform<6, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp index 146da33869..f0f10d2f43 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -235,4 +234,5 @@ void Transform<6, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp index f6fc5e8b84..c638eaacde 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -296,7 +295,6 @@ void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t "addvl %x[out], %x[out], #6\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -318,4 +316,5 @@ void Transform<6, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp index 07147acd8e..0526bd0596 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -281,7 +280,6 @@ void 
sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t widt "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width) : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -303,4 +301,5 @@ void Transform<8, 1, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp index 3ba50fee60..98f0770d77 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -283,4 +282,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp index 6b5ca38ab1..3fa5292143 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -256,4 +255,5 @@ void Transform<8, 8, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp index 237e9b684f..02977ecf1e 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -354,7 +353,6 @@ void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -376,4 +374,5 @@ void Transform<8, 2, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp index 51cae7dd5a..34799c60a6 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp @@ -24,8 +24,7 @@ 
#pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -439,7 +438,6 @@ void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t "addvl %x[out], %x[out], #8\n" "bge 7b\n" "12:" // Done - : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" @@ -461,4 +459,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp index 4ad882870e..5a48e579ae 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp @@ -24,8 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE - +#if defined(ARM_COMPUTE_ENABLE_SVE) namespace { @@ -279,4 +278,5 @@ void Transform<8, 4, true, VLType::SVE>( ); } -#endif + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index a28ddadc68..11b1bd3e05 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,7 +80,8 @@ inline T roundup(const T a, const T b) { enum class VLType { None, SVE, - SME + SME, + SME2 }; template -- cgit v1.2.1
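
A note on the guard change repeated across the transforms above: __ARM_FEATURE_SVE is a compiler feature macro, defined only when the translation unit itself is compiled for an SVE target, so kernels guarded by it drop out of any build whose baseline architecture is plain NEON. ARM_COMPUTE_ENABLE_SVE is set by the build system instead, which lets these SVE kernels be compiled into a NEON-baseline library and selected at runtime. The sketch below shows that shape; it is a minimal hypothetical example, not ComputeLibrary's actual dispatch code, and cpu_supports_sve() is an assumed stand-in for a real capability probe (e.g. one reading HWCAP_SVE on Linux).

// Minimal sketch of build-option guarding plus runtime dispatch.
// Hypothetical names throughout; not ComputeLibrary's selection code.
#include <cstddef>
#include <cstdio>

#if defined(ARM_COMPUTE_ENABLE_SVE)
// Assumed runtime probe; a real one would query HWCAP_SVE or ID registers.
static bool cpu_supports_sve() { return false; }

// Present whenever the build enables SVE kernels, even if this translation
// unit's baseline architecture is plain NEON.
static void copy_sve(float *out, const float *in, std::size_t n) {
    for (std::size_t i = 0; i < n; i++) out[i] = in[i];  // stand-in for SVE asm
}
#endif

static void copy_neon(float *out, const float *in, std::size_t n) {
    for (std::size_t i = 0; i < n; i++) out[i] = in[i];  // stand-in for NEON asm
}

static void copy_dispatch(float *out, const float *in, std::size_t n) {
#if defined(ARM_COMPUTE_ENABLE_SVE)
    if (cpu_supports_sve()) {  // decided on the machine we run on,
        copy_sve(out, in, n);  // not the machine we compiled on
        return;
    }
#endif
    copy_neon(out, in, n);
}

int main() {
    float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out[4];
    copy_dispatch(out, in, 4);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}

The VLType::SME2 enumerator added in utils.hpp feeds the same selection machinery one level up: with it, kernel descriptors can distinguish SME2-capable CPUs from plain SME when an implementation is chosen.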